///////////////////////////////////////////////////////////////////////////////
//
/// \file       memcmplen.h
/// \brief      Optimized comparison of two buffers
//
//  Author:     Lasse Collin
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef LZMA_MEMCMPLEN_H
#define LZMA_MEMCMPLEN_H

#include "common.h"

// memcpy() is used for safe unaligned access in lzma_memcmplen().
#include <string.h>

#ifdef HAVE_IMMINTRIN_H
#	include <immintrin.h>
#endif


/// Find out how many equal bytes the two buffers have.
///
/// \param      buf1    First buffer
/// \param      buf2    Second buffer
/// \param      len     How many bytes have already been compared and will
///                     be assumed to match
/// \param      limit   How many bytes to compare at most, including the
///                     already-compared bytes. This must be significantly
///                     smaller than UINT32_MAX to avoid integer overflows.
///                     Up to LZMA_MEMCMPLEN_EXTRA bytes may be read past
///                     the specified limit from both buf1 and buf2.
///
/// \return     Number of equal bytes in the buffers is returned.
///             This is always at least len and at most limit.
///
/// \note       LZMA_MEMCMPLEN_EXTRA defines how many extra bytes may be read.
///             It's rounded up to 2^n. This extra amount needs to be
///             allocated in the buffers being used. It needs to be
///             initialized too to keep Valgrind quiet.
42278307Srpaulostatic inline uint32_t lzma_attribute((__always_inline__)) 43278307Srpaulolzma_memcmplen(const uint8_t *buf1, const uint8_t *buf2, 44278307Srpaulo uint32_t len, uint32_t limit) 45278307Srpaulo{ 46278307Srpaulo assert(len <= limit); 47278307Srpaulo assert(limit <= UINT32_MAX / 2); 48278307Srpaulo 49278307Srpaulo#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ 50278307Srpaulo && ((TUKLIB_GNUC_REQ(3, 4) && defined(__x86_64__)) \ 51278307Srpaulo || (defined(__INTEL_COMPILER) && defined(__x86_64__)) \ 52278307Srpaulo || (defined(__INTEL_COMPILER) && defined(_M_X64)) \ 53278307Srpaulo || (defined(_MSC_VER) && defined(_M_X64))) 54278307Srpaulo // NOTE: This will use 64-bit unaligned access which 55278307Srpaulo // TUKLIB_FAST_UNALIGNED_ACCESS wasn't meant to permit, but 56278307Srpaulo // it's convenient here at least as long as it's x86-64 only. 57278307Srpaulo // 58278307Srpaulo // I keep this x86-64 only for now since that's where I know this 59278307Srpaulo // to be a good method. This may be fine on other 64-bit CPUs too. 60278307Srpaulo // On big endian one should use xor instead of subtraction and switch 61278307Srpaulo // to __builtin_clzll(). 
62292588Sdelphij#define LZMA_MEMCMPLEN_EXTRA 8 63278307Srpaulo while (len < limit) { 64278307Srpaulo const uint64_t x = *(const uint64_t *)(buf1 + len) 65278307Srpaulo - *(const uint64_t *)(buf2 + len); 66278307Srpaulo if (x != 0) { 67278307Srpaulo# if defined(_M_X64) // MSVC or Intel C compiler on Windows 68278307Srpaulo unsigned long tmp; 69278307Srpaulo _BitScanForward64(&tmp, x); 70278307Srpaulo len += (uint32_t)tmp >> 3; 71278307Srpaulo# else // GCC, clang, or Intel C compiler 72278307Srpaulo len += (uint32_t)__builtin_ctzll(x) >> 3; 73278307Srpaulo# endif 74278307Srpaulo return my_min(len, limit); 75278307Srpaulo } 76278307Srpaulo 77278307Srpaulo len += 8; 78278307Srpaulo } 79278307Srpaulo 80278307Srpaulo return limit; 81278307Srpaulo 82278307Srpaulo#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ 83278307Srpaulo && defined(HAVE__MM_MOVEMASK_EPI8) \ 84278307Srpaulo && ((defined(__GNUC__) && defined(__SSE2_MATH__)) \ 85278307Srpaulo || (defined(__INTEL_COMPILER) && defined(__SSE2__)) \ 86278307Srpaulo || (defined(_MSC_VER) && defined(_M_IX86_FP) \ 87278307Srpaulo && _M_IX86_FP >= 2)) 88278307Srpaulo // NOTE: Like above, this will use 128-bit unaligned access which 89278307Srpaulo // TUKLIB_FAST_UNALIGNED_ACCESS wasn't meant to permit. 90278307Srpaulo // 91278307Srpaulo // SSE2 version for 32-bit and 64-bit x86. On x86-64 the above 92278307Srpaulo // version is sometimes significantly faster and sometimes 93278307Srpaulo // slightly slower than this SSE2 version, so this SSE2 94278307Srpaulo // version isn't used on x86-64. 
95292588Sdelphij# define LZMA_MEMCMPLEN_EXTRA 16 96278307Srpaulo while (len < limit) { 97278307Srpaulo const uint32_t x = 0xFFFF ^ _mm_movemask_epi8(_mm_cmpeq_epi8( 98278307Srpaulo _mm_loadu_si128((const __m128i *)(buf1 + len)), 99278307Srpaulo _mm_loadu_si128((const __m128i *)(buf2 + len)))); 100278307Srpaulo 101278307Srpaulo if (x != 0) { 102278307Srpaulo# if defined(__INTEL_COMPILER) 103278307Srpaulo len += _bit_scan_forward(x); 104278307Srpaulo# elif defined(_MSC_VER) 105278307Srpaulo unsigned long tmp; 106278307Srpaulo _BitScanForward(&tmp, x); 107278307Srpaulo len += tmp; 108278307Srpaulo# else 109278307Srpaulo len += __builtin_ctz(x); 110278307Srpaulo# endif 111278307Srpaulo return my_min(len, limit); 112278307Srpaulo } 113278307Srpaulo 114278307Srpaulo len += 16; 115278307Srpaulo } 116278307Srpaulo 117278307Srpaulo return limit; 118278307Srpaulo 119278307Srpaulo#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && !defined(WORDS_BIGENDIAN) 120278307Srpaulo // Generic 32-bit little endian method 121292588Sdelphij# define LZMA_MEMCMPLEN_EXTRA 4 122278307Srpaulo while (len < limit) { 123278307Srpaulo uint32_t x = *(const uint32_t *)(buf1 + len) 124278307Srpaulo - *(const uint32_t *)(buf2 + len); 125278307Srpaulo if (x != 0) { 126278307Srpaulo if ((x & 0xFFFF) == 0) { 127278307Srpaulo len += 2; 128278307Srpaulo x >>= 16; 129278307Srpaulo } 130278307Srpaulo 131278307Srpaulo if ((x & 0xFF) == 0) 132278307Srpaulo ++len; 133278307Srpaulo 134278307Srpaulo return my_min(len, limit); 135278307Srpaulo } 136278307Srpaulo 137278307Srpaulo len += 4; 138278307Srpaulo } 139278307Srpaulo 140278307Srpaulo return limit; 141278307Srpaulo 142278307Srpaulo#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && defined(WORDS_BIGENDIAN) 143278307Srpaulo // Generic 32-bit big endian method 144292588Sdelphij# define LZMA_MEMCMPLEN_EXTRA 4 145278307Srpaulo while (len < limit) { 146278307Srpaulo uint32_t x = *(const uint32_t *)(buf1 + len) 147278307Srpaulo ^ *(const uint32_t *)(buf2 + len); 
148278307Srpaulo if (x != 0) { 149278307Srpaulo if ((x & 0xFFFF0000) == 0) { 150278307Srpaulo len += 2; 151278307Srpaulo x <<= 16; 152278307Srpaulo } 153278307Srpaulo 154278307Srpaulo if ((x & 0xFF000000) == 0) 155278307Srpaulo ++len; 156278307Srpaulo 157278307Srpaulo return my_min(len, limit); 158278307Srpaulo } 159278307Srpaulo 160278307Srpaulo len += 4; 161278307Srpaulo } 162278307Srpaulo 163278307Srpaulo return limit; 164278307Srpaulo 165278307Srpaulo#else 166278307Srpaulo // Simple portable version that doesn't use unaligned access. 167292588Sdelphij# define LZMA_MEMCMPLEN_EXTRA 0 168278307Srpaulo while (len < limit && buf1[len] == buf2[len]) 169278307Srpaulo ++len; 170278307Srpaulo 171278307Srpaulo return len; 172278307Srpaulo#endif 173278307Srpaulo} 174278307Srpaulo 175278307Srpaulo#endif 176