///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: bswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads (16/32-bit only): readXXYe(ptr)
///   - Unaligned writes (16/32-bit only): writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////
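
// A minimal usage sketch (illustrative only, not part of the API above):
//
//     uint8_t buf[4];
//     write32be(buf, 0x11223344);   // buf = { 0x11, 0x22, 0x33, 0x44 }
//     uint32_t x = read32le(buf);   // x == 0x44332211
//     uint32_t top = bsr32(x);      // top == 30 (highest set bit of x)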

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define bswap16(n) __builtin_bswap16(n)
#	define bswap32(n) __builtin_bswap32(n)
#	define bswap64(n) __builtin_bswap64(n)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define bswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define bswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define bswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define bswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define bswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define bswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef bswap16
#	define bswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef bswap32
#	define bswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef bswap64
#	define bswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
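
// For example, whichever variant gets used above, bswap16(0xABCD) evaluates
// to 0xCDAB and bswap32(0x11223344) to 0x44332211.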

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) bswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) bswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) bswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) bswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) bswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) bswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
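
// For example, on a little-endian build conv32be(0x11223344) byteswaps to
// 0x44332211 while conv32le(0x11223344) is just a cast; on a big-endian
// build the roles are reversed.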


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
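//
// As a rough illustration (compiler output varies): with the memcpy()
// method, GCC and Clang on x86-64 typically inline read32ne() into a
// single unaligned load, something like
//
//     movl    (%rdi), %eax
//     ret
//
// instead of emitting a call to memcpy().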

static inline uint16_t
read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING)
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
	uint16_t num = read16ne(buf);
	return conv16be(num);
#else
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
#endif
}


static inline uint16_t
read16le(const uint8_t *buf)
{
#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
	uint16_t num = read16ne(buf);
	return conv16le(num);
#else
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
#endif
}


static inline uint32_t
read32be(const uint8_t *buf)
{
#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
	uint32_t num = read32ne(buf);
	return conv32be(num);
#else
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
#endif
}


static inline uint32_t
read32le(const uint8_t *buf)
{
#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
	uint32_t num = read32ne(buf);
	return conv32le(num);
#else
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
#endif
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#if defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
#	define write16be(buf, num) write16ne(buf, conv16be(num))
#	define write32be(buf, num) write32ne(buf, conv32be(num))
#endif

#if !defined(WORDS_BIGENDIAN) || defined(TUKLIB_FAST_UNALIGNED_ACCESS)
#	define write16le(buf, num) write16ne(buf, conv16le(num))
#	define write32le(buf, num) write32ne(buf, conv32le(num))
#endif
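
// For example, on a little-endian system with fast unaligned access,
// write32be(buf, 0x12345678) expands to write32ne(buf, conv32be(0x12345678)),
// so the byte swap of the constant happens at compile time and only a plain
// 32-bit store remains at run time.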


#ifndef write16be
static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}
#endif


#ifndef write16le
static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}
#endif


#ifndef write32be
static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}
#endif


#ifndef write32le
static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}
#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif
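
// For example, on a strict-align target __builtin_assume_aligned() lets
// the compiler turn the memcpy() in aligned_read32ne(),
//
//     uint32_t num;
//     memcpy(&num, __builtin_assume_aligned(buf, sizeof(num)), sizeof(num));
//
// into a single aligned 32-bit load instead of four byte loads combined
// with shifts and ORs.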


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))


////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif TUKLIB_GNUC_REQ(3, 4) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}
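
// For example, bsr32(1) == 0, bsr32(0x00012345) == 16, and
// bsr32(UINT32_MAX) == 31.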


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif TUKLIB_GNUC_REQ(3, 4) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}
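
// For example, clz32(1) == 31, clz32(0x00012345) == 15, and
// clz32(0x80000000) == 0.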


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif TUKLIB_GNUC_REQ(3, 4) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32
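
// For example, ctz32(1) == 0, ctz32(0x00012300) == 8 (0x300 has its lowest
// set bit at bit 8), and bsf32(0x80000000) == 31.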

#endif