// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>
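
// Example usage (an illustrative sketch, not part of this header): decoding
// a little-endian 32-bit field from a byte buffer and storing it back in
// big endian:
//
//     uint8_t buf[4] = { 0x44, 0x33, 0x22, 0x11 };
//     uint32_t v = read32le(buf);   // v == 0x11223344
//     write32be(buf, v);            // buf is now { 0x11, 0x22, 0x33, 0x44 }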

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define byteswap16(num) __builtin_bswap16(num)
#	define byteswap32(num) __builtin_bswap32(num)
#	define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define byteswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define byteswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define byteswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>
#	define byteswap16(num) bswap16(num)
#	define byteswap32(num) bswap32(num)
#	define byteswap64(num) bswap64(num)

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define byteswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define byteswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define byteswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef byteswap16
#	define byteswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef byteswap32
#	define byteswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef byteswap64
#	define byteswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
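
// A quick illustration of the byte swapping macros (a sketch, not part of
// this header):
//
//     byteswap16(0x1122) == 0x2211
//     byteswap32(UINT32_C(0x11223344)) == UINT32_C(0x44332211)
//     byteswap64(UINT64_C(0x0102030405060708))
//             == UINT64_C(0x0807060504030201)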

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) byteswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) byteswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) byteswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) byteswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) byteswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) byteswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load, producing code
// identical to the memcpy() method. In other cases Clang and GCC produce
// either the same or better code with memcpy(). For example, Clang 9 on
// x86-64 can detect the 32-bit load but not the 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only when GCC assumes
// that unaligned access is not supported or when -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8)
		| ((uint32_t)b[2] << 16)
		| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}
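

// For example (an illustrative note, not from the original source):
// with buf[] = { 0x12, 0x34 }, read16be(buf) returns 0x1234 while
// read16le(buf) returns 0x3412, regardless of the native endianness.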


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}
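

// As a concrete illustration (not from the original source): in the
// fast-unaligned branch above, writeXXYe() are macros, so a call like
// write32be(buf, UINT32_C(0x12345678)) lets the compiler byteswap the
// constant at compile time; here in the byte-by-byte branch the same call
// simply stores the bytes 0x12, 0x34, 0x56, 0x78 in order.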


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned, which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned, but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
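//
// For example (an illustrative note, not from the original source), with
// this extension the compiler can turn
//
//     memcpy(&num, __builtin_assume_aligned(buf, 4), 4);
//
// into one aligned 32-bit load on a strict-align target, whereas a plain
// memcpy() from a pointer of unknown alignment may become four byte loads
// or a real function call.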
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}
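

// Example (an illustrative sketch, not from the original source): when p
// is known to be 4-byte aligned, prefer the aligned variant so strict-align
// targets can use a single aligned load:
//
//     uint32_t v = aligned_read32le(p);   // p must be 4-byte aligned
//     uint32_t w = read32le(q);           // q may have any alignment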


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))


////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}
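

// For example (illustrative): for n = 0x00010000 (only bit 16 set),
// bsr32(n) == 16 and clz32(n) == 15; in general bsr32(n) == 31 - clz32(n)
// for any non-zero n.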


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32

#endif