// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////
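
// Illustrative usage sketch (not part of the header itself): assuming "buf"
// points to at least eight readable and writable bytes, the unaligned helpers
// documented above could be used like this:
//
//     uint32_t len = read32le(buf);        // 32-bit little endian load
//     write64be(buf, (uint64_t)len << 3);  // 64-bit big endian store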

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define byteswap16(num) __builtin_bswap16(num)
#	define byteswap32(num) __builtin_bswap32(num)
#	define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define byteswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define byteswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define byteswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>
#	define byteswap16(num) bswap16(num)
#	define byteswap32(num) bswap32(num)
#	define byteswap64(num) bswap64(num)

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define byteswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define byteswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define byteswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef byteswap16
#	define byteswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef byteswap32
#	define byteswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef byteswap64
#	define byteswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) byteswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) byteswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) byteswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) byteswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) byteswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) byteswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example,
// Clang 9 on x86-64 can detect a 32-bit load but not a 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only when GCC assumes
// that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
    uint16_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

uint32_t bytes32(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8)
        | ((uint32_t)b[2] << 16)
        | ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
    uint32_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
    memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
    b[2] = (uint8_t)(v >> 16);
    b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
    memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
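
// For example, with a constant argument such as
//
//     write32be(buf, UINT32_C(0x11223344));
//
// conv32be() can be evaluated at compile time on little endian systems,
// so only the pre-swapped constant is stored at run time.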

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
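
// Illustrative sketch (not part of the API): assuming "hdr" points to a
// buffer whose start is at least 4-byte aligned and begins with a 32-bit
// big endian length field, a caller could do
//
//     uint32_t len = aligned_read32be(hdr);
//     aligned_write32be(hdr, len + 1);
//
// and, when __builtin_assume_aligned is available, each line should compile
// to a single aligned load or store plus a possible byte swap.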


////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
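	//
	// For example, with n = UINT32_C(0x00010000), __builtin_clz(n) is 15
	// and 15 ^ 31 is 16, which is the index of the highest set bit.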
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
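	// Generic fallback: locate the highest set bit with a binary search,
	// halving the remaining bit range at each step.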
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32

#endif