// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
38285SN/A// 39285SN/A// Authors: Lasse Collin 40285SN/A// Joachim Henke 41285SN/A// 42285SN/A/////////////////////////////////////////////////////////////////////////////// 43285SN/A 44285SN/A#ifndef TUKLIB_INTEGER_H 45285SN/A#define TUKLIB_INTEGER_H 46285SN/A 47285SN/A#include "tuklib_common.h" 48285SN/A#include <string.h> 49285SN/A 50285SN/A// Newer Intel C compilers require immintrin.h for _bit_scan_reverse() 51285SN/A// and such functions. 52285SN/A#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) 53285SN/A# include <immintrin.h> 54285SN/A// Only include <intrin.h> when it is needed. GCC and Clang can both 55285SN/A// use __builtin's, so we only need Windows instrincs when using MSVC. 56285SN/A// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these 57285SN/A// cases explicitly. 58285SN/A#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__) 59285SN/A# include <intrin.h> 60285SN/A#endif 61285SN/A 62285SN/A 63285SN/A/////////////////// 64285SN/A// Byte swapping // 65285SN/A/////////////////// 66285SN/A 67285SN/A#if defined(HAVE___BUILTIN_BSWAPXX) 68285SN/A // GCC >= 4.8 and Clang 69285SN/A# define byteswap16(num) __builtin_bswap16(num) 70285SN/A# define byteswap32(num) __builtin_bswap32(num) 71285SN/A# define byteswap64(num) __builtin_bswap64(num) 72285SN/A 73285SN/A#elif defined(HAVE_BYTESWAP_H) 74285SN/A // glibc, uClibc, dietlibc 75285SN/A# include <byteswap.h> 76285SN/A# ifdef HAVE_BSWAP_16 77285SN/A# define byteswap16(num) bswap_16(num) 78285SN/A# endif 79396SN/A# ifdef HAVE_BSWAP_32 80285SN/A# define byteswap32(num) bswap_32(num) 81285SN/A# endif 82285SN/A# ifdef HAVE_BSWAP_64 83285SN/A# define byteswap64(num) bswap_64(num) 84285SN/A# endif 85285SN/A 86285SN/A#elif defined(HAVE_SYS_ENDIAN_H) 87285SN/A // *BSDs and Darwin 88285SN/A# include <sys/endian.h> 89285SN/A# define byteswap16(num) bswap16(num) 90285SN/A# define byteswap32(num) bswap32(num) 91285SN/A# define byteswap64(num) bswap64(num) 92285SN/A 
93285SN/A#elif defined(HAVE_SYS_BYTEORDER_H) 94285SN/A // Solaris 95285SN/A# include <sys/byteorder.h> 96285SN/A# ifdef BSWAP_16 97285SN/A# define byteswap16(num) BSWAP_16(num) 98285SN/A# endif 99285SN/A# ifdef BSWAP_32 100396SN/A# define byteswap32(num) BSWAP_32(num) 101396SN/A# endif 102396SN/A# ifdef BSWAP_64 103396SN/A# define byteswap64(num) BSWAP_64(num) 104396SN/A# endif 105396SN/A# ifdef BE_16 106396SN/A# define conv16be(num) BE_16(num) 107396SN/A# endif 108396SN/A# ifdef BE_32 109396SN/A# define conv32be(num) BE_32(num) 110396SN/A# endif 111396SN/A# ifdef BE_64 112396SN/A# define conv64be(num) BE_64(num) 113396SN/A# endif 114396SN/A# ifdef LE_16 115396SN/A# define conv16le(num) LE_16(num) 116396SN/A# endif 117396SN/A# ifdef LE_32 118396SN/A# define conv32le(num) LE_32(num) 119396SN/A# endif 120396SN/A# ifdef LE_64 121396SN/A# define conv64le(num) LE_64(num) 122396SN/A# endif 123396SN/A#endif 124396SN/A 125396SN/A#ifndef byteswap16 126396SN/A# define byteswap16(n) (uint16_t)( \ 127396SN/A (((n) & 0x00FFU) << 8) \ 128396SN/A | (((n) & 0xFF00U) >> 8) \ 129396SN/A ) 130396SN/A#endif 131396SN/A 132396SN/A#ifndef byteswap32 133396SN/A# define byteswap32(n) (uint32_t)( \ 134396SN/A (((n) & UINT32_C(0x000000FF)) << 24) \ 135396SN/A | (((n) & UINT32_C(0x0000FF00)) << 8) \ 136396SN/A | (((n) & UINT32_C(0x00FF0000)) >> 8) \ 137396SN/A | (((n) & UINT32_C(0xFF000000)) >> 24) \ 138396SN/A ) 139396SN/A#endif 140396SN/A 141396SN/A#ifndef byteswap64 142396SN/A# define byteswap64(n) (uint64_t)( \ 143396SN/A (((n) & UINT64_C(0x00000000000000FF)) << 56) \ 144396SN/A | (((n) & UINT64_C(0x000000000000FF00)) << 40) \ 145396SN/A | (((n) & UINT64_C(0x0000000000FF0000)) << 24) \ 146396SN/A | (((n) & UINT64_C(0x00000000FF000000)) << 8) \ 147396SN/A | (((n) & UINT64_C(0x000000FF00000000)) >> 8) \ 148396SN/A | (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \ 149396SN/A | (((n) & UINT64_C(0x00FF000000000000)) >> 40) \ 150396SN/A | (((n) & UINT64_C(0xFF00000000000000)) >> 56) \ 151285SN/A 
) 152285SN/A#endif 153285SN/A 154285SN/A// Define conversion macros using the basic byte swapping macros. 155285SN/A#ifdef WORDS_BIGENDIAN 156285SN/A# ifndef conv16be 157285SN/A# define conv16be(num) ((uint16_t)(num)) 158285SN/A# endif 159285SN/A# ifndef conv32be 160285SN/A# define conv32be(num) ((uint32_t)(num)) 161285SN/A# endif 162285SN/A# ifndef conv64be 163285SN/A# define conv64be(num) ((uint64_t)(num)) 164285SN/A# endif 165396SN/A# ifndef conv16le 166285SN/A# define conv16le(num) byteswap16(num) 167396SN/A# endif 168396SN/A# ifndef conv32le 169285SN/A# define conv32le(num) byteswap32(num) 170285SN/A# endif 171396SN/A# ifndef conv64le 172285SN/A# define conv64le(num) byteswap64(num) 173396SN/A# endif 174396SN/A#else 175396SN/A# ifndef conv16be 176285SN/A# define conv16be(num) byteswap16(num) 177285SN/A# endif 178285SN/A# ifndef conv32be 179285SN/A# define conv32be(num) byteswap32(num) 180285SN/A# endif 181285SN/A# ifndef conv64be 182285SN/A# define conv64be(num) byteswap64(num) 183285SN/A# endif 184285SN/A# ifndef conv16le 185396SN/A# define conv16le(num) ((uint16_t)(num)) 186285SN/A# endif 187285SN/A# ifndef conv32le 188285SN/A# define conv32le(num) ((uint32_t)(num)) 189285SN/A# endif 190285SN/A# ifndef conv64le 191285SN/A# define conv64le(num) ((uint64_t)(num)) 192285SN/A# endif 193285SN/A#endif 194285SN/A 195285SN/A 196285SN/A//////////////////////////////// 197285SN/A// Unaligned reads and writes // 198285SN/A//////////////////////////////// 199285SN/A 200285SN/A// No-strict-align archs like x86-64 201285SN/A// --------------------------------- 202285SN/A// 203285SN/A// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer 204285SN/A// is bad even if the uint8_pointer is properly aligned because this kind 205396SN/A// of casts break strict aliasing rules and result in undefined behavior. 
206285SN/A// With unaligned pointers it's even worse: compilers may emit vector 207396SN/A// instructions that require aligned pointers even if non-vector 208396SN/A// instructions work with unaligned pointers. 209396SN/A// 210285SN/A// Using memcpy() is the standard compliant way to do unaligned access. 211285SN/A// Many modern compilers inline it so there is no function call overhead. 212285SN/A// For those compilers that don't handle the memcpy() method well, the 213285SN/A// old casting method (that violates strict aliasing) can be requested at 214285SN/A// build time. A third method, casting to a packed struct, would also be 215285SN/A// an option but isn't provided to keep things simpler (it's already a mess). 216285SN/A// Hopefully this is flexible enough in practice. 217285SN/A// 218285SN/A// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that 219285SN/A// 220285SN/A// buf[0] | (buf[1] << 8) 221285SN/A// 222285SN/A// reads a 16-bit value and can emit a single 16-bit load and produce 223285SN/A// identical code than with the memcpy() method. In other cases Clang and GCC 224285SN/A// produce either the same or better code with memcpy(). For example, Clang 9 225285SN/A// on x86-64 can detect 32-bit load but not 16-bit load. 226285SN/A// 227285SN/A// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte 228396SN/A// code for "buf[0] | (buf[1] << 8)". 229396SN/A// 230396SN/A// Conclusion: The memcpy() method is the best choice when unaligned access 231396SN/A// is supported. 
232396SN/A// 233285SN/A// Strict-align archs like SPARC 234285SN/A// ----------------------------- 235285SN/A// 236285SN/A// GCC versions from around 4.x to to at least 13.2.0 produce worse code 237285SN/A// from the memcpy() method than from simple byte-by-byte shift-or code 238285SN/A// when reading a 32-bit integer: 239285SN/A// 240285SN/A// (1) It may be constructed on stack using using four 8-bit loads, 241285SN/A// four 8-bit stores to stack, and finally one 32-bit load from stack. 242285SN/A// 243285SN/A// (2) Especially with -Os, an actual memcpy() call may be emitted. 244285SN/A// 245285SN/A// This is true on at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and 246285SN/A// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in 247396SN/A// some processors but not all so this is relevant only in the case when 248285SN/A// GCC assumes that unaligned is not supported or -mstrict-align or 249396SN/A// -mno-unaligned-access is used. 250396SN/A// 251285SN/A// For Clang it makes little difference. ARM64 with -O2 -mstrict-align 252285SN/A// was one the very few with a minor difference: the memcpy() version 253396SN/A// was one instruction longer. 254285SN/A// 255285SN/A// Conclusion: At least in case of GCC and Clang, byte-by-byte code is 256396SN/A// the best choice for strict-align archs to do unaligned access. 257285SN/A// 258285SN/A// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502 259285SN/A// 260285SN/A// Thanks to <https://godbolt.org/> it was easy to test different compilers. 
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8)
		| ((uint32_t)b[2] << 16)
		| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

// Unaligned native endian reads and writes. By default memcpy() is used
// (standard compliant, inlined by modern compilers); when the build
// explicitly requests it, the old type-punning cast is used instead.

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t v;
	memcpy(&v, buf, sizeof(v));
	return v;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t v;
	memcpy(&v, buf, sizeof(v));
	return v;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t v;
	memcpy(&v, buf, sizeof(v));
	return v;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
}


// Endianness-converting reads: a native read followed by a conversion.
// The intermediate variable keeps the conv macros (which may evaluate
// their argument more than once) from re-reading the buffer.

static inline uint16_t
read16be(const uint8_t *buf)
{
	const uint16_t n = read16ne(buf);
	return conv16be(n);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	const uint16_t n = read16ne(buf);
	return conv16le(n);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	const uint32_t n = read32ne(buf);
	return conv32be(n);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	const uint32_t n = read32ne(buf);
	return conv32le(n);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	const uint64_t n = read64ne(buf);
	return conv64be(n);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	const uint64_t n = read64ne(buf);
	return conv64le(n);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))

#else

// Strict-align path: everything is done byte by byte, so the native
// endian operations are simply aliases for the matching fixed-endian ones.
#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	return (uint16_t)(((uint16_t)buf[0] << 8) | (uint16_t)buf[1]);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	return (uint16_t)((uint16_t)buf[0] | ((uint16_t)buf[1] << 8));
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	return ((uint32_t)buf[0] << 24)
			| ((uint32_t)buf[1] << 16)
			| ((uint32_t)buf[2] << 8)
			| (uint32_t)buf[3];
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	return (uint32_t)buf[0]
			| ((uint32_t)buf[1] << 8)
			| ((uint32_t)buf[2] << 16)
			| ((uint32_t)buf[3] << 24);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	return ((uint64_t)buf[0] << 56)
			| ((uint64_t)buf[1] << 48)
			| ((uint64_t)buf[2] << 40)
			| ((uint64_t)buf[3] << 32)
			| ((uint64_t)buf[4] << 24)
			| ((uint64_t)buf[5] << 16)
			| ((uint64_t)buf[6] << 8)
			| (uint64_t)buf[7];
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	return (uint64_t)buf[0]
			| ((uint64_t)buf[1] << 8)
			| ((uint64_t)buf[2] << 16)
			| ((uint64_t)buf[3] << 24)
			| ((uint64_t)buf[4] << 32)
			| ((uint64_t)buf[5] << 40)
			| ((uint64_t)buf[6] << 48)
			| ((uint64_t)buf[7] << 56);
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
}

#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
633// As a compromise, aligned reads will fall back to non-compliant type punning 634// but aligned writes will be byte-by-byte, that is, fast reads are preferred 635// over fast writes. This obviously isn't great but hopefully it's a working 636// compromise for now. 637// 638// __builtin_assume_aligned is support by GCC >= 4.7 and clang >= 3.6. 639#ifdef HAVE___BUILTIN_ASSUME_ALIGNED 640# define tuklib_memcpy_aligned(dest, src, size) \ 641 memcpy(dest, __builtin_assume_aligned(src, size), size) 642#else 643# define tuklib_memcpy_aligned(dest, src, size) \ 644 memcpy(dest, src, size) 645# ifndef TUKLIB_FAST_UNALIGNED_ACCESS 646# define TUKLIB_USE_UNSAFE_ALIGNED_READS 1 647# endif 648#endif 649 650 651static inline uint16_t 652aligned_read16ne(const uint8_t *buf) 653{ 654#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \ 655 || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS) 656 return *(const uint16_t *)buf; 657#else 658 uint16_t num; 659 tuklib_memcpy_aligned(&num, buf, sizeof(num)); 660 return num; 661#endif 662} 663 664 665static inline uint32_t 666aligned_read32ne(const uint8_t *buf) 667{ 668#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \ 669 || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS) 670 return *(const uint32_t *)buf; 671#else 672 uint32_t num; 673 tuklib_memcpy_aligned(&num, buf, sizeof(num)); 674 return num; 675#endif 676} 677 678 679static inline uint64_t 680aligned_read64ne(const uint8_t *buf) 681{ 682#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \ 683 || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS) 684 return *(const uint64_t *)buf; 685#else 686 uint64_t num; 687 tuklib_memcpy_aligned(&num, buf, sizeof(num)); 688 return num; 689#endif 690} 691 692 693static inline void 694aligned_write16ne(uint8_t *buf, uint16_t num) 695{ 696#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING 697 *(uint16_t *)buf = num; 698#else 699 tuklib_memcpy_aligned(buf, &num, sizeof(num)); 700#endif 701 return; 702} 703 704 705static inline void 706aligned_write32ne(uint8_t *buf, uint32_t num) 707{ 708#ifdef 
TUKLIB_USE_UNSAFE_TYPE_PUNNING 709 *(uint32_t *)buf = num; 710#else 711 tuklib_memcpy_aligned(buf, &num, sizeof(num)); 712#endif 713 return; 714} 715 716 717static inline void 718aligned_write64ne(uint8_t *buf, uint64_t num) 719{ 720#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING 721 *(uint64_t *)buf = num; 722#else 723 tuklib_memcpy_aligned(buf, &num, sizeof(num)); 724#endif 725 return; 726} 727 728 729static inline uint16_t 730aligned_read16be(const uint8_t *buf) 731{ 732 uint16_t num = aligned_read16ne(buf); 733 return conv16be(num); 734} 735 736 737static inline uint16_t 738aligned_read16le(const uint8_t *buf) 739{ 740 uint16_t num = aligned_read16ne(buf); 741 return conv16le(num); 742} 743 744 745static inline uint32_t 746aligned_read32be(const uint8_t *buf) 747{ 748 uint32_t num = aligned_read32ne(buf); 749 return conv32be(num); 750} 751 752 753static inline uint32_t 754aligned_read32le(const uint8_t *buf) 755{ 756 uint32_t num = aligned_read32ne(buf); 757 return conv32le(num); 758} 759 760 761static inline uint64_t 762aligned_read64be(const uint8_t *buf) 763{ 764 uint64_t num = aligned_read64ne(buf); 765 return conv64be(num); 766} 767 768 769static inline uint64_t 770aligned_read64le(const uint8_t *buf) 771{ 772 uint64_t num = aligned_read64ne(buf); 773 return conv64le(num); 774} 775 776 777// These need to be macros like in the unaligned case. 
778#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num)) 779#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num)) 780#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num)) 781#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num)) 782#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num)) 783#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num)) 784 785 786//////////////////// 787// Bit operations // 788//////////////////// 789 790static inline uint32_t 791bsr32(uint32_t n) 792{ 793 // Check for ICC first, since it tends to define __GNUC__ too. 794#if defined(__INTEL_COMPILER) 795 return _bit_scan_reverse(n); 796 797#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX 798 // GCC >= 3.4 has __builtin_clz(), which gives good results on 799 // multiple architectures. On x86, __builtin_clz() ^ 31U becomes 800 // either plain BSR (so the XOR gets optimized away) or LZCNT and 801 // XOR (if -march indicates that SSE4a instructions are supported). 
802 return (uint32_t)__builtin_clz(n) ^ 31U; 803 804#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) 805 uint32_t i; 806 __asm__("bsrl %1, %0" : "=r" (i) : "rm" (n)); 807 return i; 808 809#elif defined(_MSC_VER) 810 unsigned long i; 811 _BitScanReverse(&i, n); 812 return i; 813 814#else 815 uint32_t i = 31; 816 817 if ((n & 0xFFFF0000) == 0) { 818 n <<= 16; 819 i = 15; 820 } 821 822 if ((n & 0xFF000000) == 0) { 823 n <<= 8; 824 i -= 8; 825 } 826 827 if ((n & 0xF0000000) == 0) { 828 n <<= 4; 829 i -= 4; 830 } 831 832 if ((n & 0xC0000000) == 0) { 833 n <<= 2; 834 i -= 2; 835 } 836 837 if ((n & 0x80000000) == 0) 838 --i; 839 840 return i; 841#endif 842} 843 844 845static inline uint32_t 846clz32(uint32_t n) 847{ 848#if defined(__INTEL_COMPILER) 849 return _bit_scan_reverse(n) ^ 31U; 850 851#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX 852 return (uint32_t)__builtin_clz(n); 853 854#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) 855 uint32_t i; 856 __asm__("bsrl %1, %0\n\t" 857 "xorl $31, %0" 858 : "=r" (i) : "rm" (n)); 859 return i; 860 861#elif defined(_MSC_VER) 862 unsigned long i; 863 _BitScanReverse(&i, n); 864 return i ^ 31U; 865 866#else 867 uint32_t i = 0; 868 869 if ((n & 0xFFFF0000) == 0) { 870 n <<= 16; 871 i = 16; 872 } 873 874 if ((n & 0xFF000000) == 0) { 875 n <<= 8; 876 i += 8; 877 } 878 879 if ((n & 0xF0000000) == 0) { 880 n <<= 4; 881 i += 4; 882 } 883 884 if ((n & 0xC0000000) == 0) { 885 n <<= 2; 886 i += 2; 887 } 888 889 if ((n & 0x80000000) == 0) 890 ++i; 891 892 return i; 893#endif 894} 895 896 897static inline uint32_t 898ctz32(uint32_t n) 899{ 900#if defined(__INTEL_COMPILER) 901 return _bit_scan_forward(n); 902 903#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX 904 return (uint32_t)__builtin_ctz(n); 905 906#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) 907 uint32_t i; 908 __asm__("bsfl %1, %0" : "=r" (i) : "rm" (n)); 909 
return i; 910 911#elif defined(_MSC_VER) 912 unsigned long i; 913 _BitScanForward(&i, n); 914 return i; 915 916#else 917 uint32_t i = 0; 918 919 if ((n & 0x0000FFFF) == 0) { 920 n >>= 16; 921 i = 16; 922 } 923 924 if ((n & 0x000000FF) == 0) { 925 n >>= 8; 926 i += 8; 927 } 928 929 if ((n & 0x0000000F) == 0) { 930 n >>= 4; 931 i += 4; 932 } 933 934 if ((n & 0x00000003) == 0) { 935 n >>= 2; 936 i += 2; 937 } 938 939 if ((n & 0x00000001) == 0) 940 ++i; 941 942 return i; 943#endif 944} 945 946#define bsf32 ctz32 947 948#endif 949