198944Sobrien/* SPDX-License-Identifier: GPL-2.0-only */
246283Sdfr/*
398944Sobrien-*- linux-c -*-
446283Sdfr   drbd_receiver.c
5130803Smarcel   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6130803Smarcel
7130803Smarcel   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
898944Sobrien   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
946283Sdfr   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
1098944Sobrien
1198944Sobrien */
1298944Sobrien
1398944Sobrien#ifndef _DRBD_VLI_H
1446283Sdfr#define _DRBD_VLI_H
1598944Sobrien
1698944Sobrien/*
1798944Sobrien * At a granularity of 4KiB storage represented per bit,
1898944Sobrien * and stroage sizes of several TiB,
1946283Sdfr * and possibly small-bandwidth replication,
2098944Sobrien * the bitmap transfer time can take much too long,
2198944Sobrien * if transmitted in plain text.
2298944Sobrien *
2398944Sobrien * We try to reduce the transferred bitmap information
2498944Sobrien * by encoding runlengths of bit polarity.
2598944Sobrien *
2698944Sobrien * We never actually need to encode a "zero" (runlengths are positive).
2798944Sobrien * But then we have to store the value of the first bit.
2898944Sobrien * The first bit of information thus shall encode if the first runlength
2998944Sobrien * gives the number of set or unset bits.
3098944Sobrien *
3198944Sobrien * We assume that large areas are either completely set or unset,
3298944Sobrien * which gives good compression with any runlength method,
3398944Sobrien * even when encoding the runlength as fixed size 32bit/64bit integers.
3498944Sobrien *
3598944Sobrien * Still, there may be areas where the polarity flips every few bits,
3698944Sobrien * and encoding the runlength sequence of those areas with fix size
3746283Sdfr * integers would be much worse than plaintext.
3846283Sdfr *
3946283Sdfr * We want to encode small runlength values with minimum code length,
40130803Smarcel * while still being able to encode a Huge run of all zeros.
41130803Smarcel *
4298944Sobrien * Thus we need a Variable Length Integer encoding, VLI.
4398944Sobrien *
4498944Sobrien * For some cases, we produce more code bits than plaintext input.
4598944Sobrien * We need to send incompressible chunks as plaintext, skip over them
46130803Smarcel * and then see if the next chunk compresses better.
47130803Smarcel *
48130803Smarcel * We don't care too much about "excellent" compression ratio for large
49130803Smarcel * runlengths (all set/all clear): whether we achieve a factor of 100
50130803Smarcel * or 1000 is not that much of an issue.
5146283Sdfr * We do not want to waste too much on short runlengths in the "noisy"
5298944Sobrien * parts of the bitmap, though.
5346283Sdfr *
5498944Sobrien * There are endless variants of VLI, we experimented with:
5598944Sobrien *  * simple byte-based
5698944Sobrien *  * various bit based with different code word length.
5798944Sobrien *
5898944Sobrien * To avoid yet an other configuration parameter (choice of bitmap compression
5998944Sobrien * algorithm) which was difficult to explain and tune, we just chose the one
6098944Sobrien * variant that turned out best in all test cases.
6198944Sobrien * Based on real world usage patterns, with device sizes ranging from a few GiB
6298944Sobrien * to several TiB, file server/mailserver/webserver/mysql/postgress,
6398944Sobrien * mostly idle to really busy, the all time winner (though sometimes only
6498944Sobrien * marginally better) is:
6598944Sobrien */
6698944Sobrien
6798944Sobrien/*
6898944Sobrien * encoding is "visualised" as
6998944Sobrien * __little endian__ bitstream, least significant bit first (left most)
70130803Smarcel *
7198944Sobrien * this particular encoding is chosen so that the prefix code
7298944Sobrien * starts as unary encoding the level, then modified so that
7398944Sobrien * 10 levels can be described in 8bit, with minimal overhead
7498944Sobrien * for the smaller levels.
7598944Sobrien *
7698944Sobrien * Number of data bits follow fibonacci sequence, with the exception of the
7798944Sobrien * last level (+1 data bit, so it makes 64bit total).  The only worse code when
7898944Sobrien * encoding bit polarity runlength is 1 plain bits => 2 code bits.
79130803Smarcelprefix    data bits                                    max val  N�� data bits
8098944Sobrien0 x                                                         0x2            1
8198944Sobrien10 x                                                        0x4            1
82130803Smarcel110 xx                                                      0x8            2
83130803Smarcel1110 xxx                                                   0x10            3
84130803Smarcel11110 xxx xx                                               0x30            5
85130803Smarcel111110 xx xxxxxx                                          0x130            8
86130803Smarcel11111100  xxxxxxxx xxxxx                                 0x2130           13
8798944Sobrien11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
88130803Smarcel11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
89130803Smarcel11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
90130803Smarcel * maximum encodable value: 0x100000400202130 == 2**56 + some */
9198944Sobrien
9298944Sobrien/* compression "table":
9398944Sobrien transmitted   x                                0.29
9498944Sobrien as plaintext x                                  ........................
9598944Sobrien             x                                   ........................
9698944Sobrien            x                                    ........................
97130803Smarcel           x    0.59                         0.21........................
9898944Sobrien          x      ........................................................
9998944Sobrien         x       .. c ...................................................
10098944Sobrien        x    0.44.. o ...................................................
10198944Sobrien       x .......... d ...................................................
10298944Sobrien      x  .......... e ...................................................
10398944Sobrien     X.............   ...................................................
10498944Sobrien    x.............. b ...................................................
10598944Sobrien2.0x............... i ...................................................
106130803Smarcel #X................ t ...................................................
10798944Sobrien #................. s ...........................  plain bits  ..........
10898944Sobrien-+-----------------------------------------------------------------------
10998944Sobrien 1             16              32                              64
11098944Sobrien*/
11198944Sobrien
11298944Sobrien/* LEVEL: (total bits, prefix bits, prefix value),
11398944Sobrien * sorted ascending by number of total bits.
11498944Sobrien * The rest of the code table is calculated at compiletime from this. */
11598944Sobrien
11698944Sobrien/* fibonacci data 1, 1, ... */
117130803Smarcel#define VLI_L_1_1() do { \
11898944Sobrien	LEVEL( 2, 1, 0x00); \
11998944Sobrien	LEVEL( 3, 2, 0x01); \
12098944Sobrien	LEVEL( 5, 3, 0x03); \
12198944Sobrien	LEVEL( 7, 4, 0x07); \
12298944Sobrien	LEVEL(10, 5, 0x0f); \
12398944Sobrien	LEVEL(14, 6, 0x1f); \
12498944Sobrien	LEVEL(21, 8, 0x3f); \
12598944Sobrien	LEVEL(29, 8, 0x7f); \
12698944Sobrien	LEVEL(42, 8, 0xbf); \
12798944Sobrien	LEVEL(64, 8, 0xff); \
128130803Smarcel	} while (0)
12998944Sobrien
13098944Sobrien/* finds a suitable level to decode the least significant part of in.
13198944Sobrien * returns number of bits consumed.
13298944Sobrien *
13398944Sobrien * BUG() for bad input, as that would mean a buggy code table. */
13498944Sobrienstatic inline int vli_decode_bits(u64 *out, const u64 in)
13598944Sobrien{
13698944Sobrien	u64 adj = 1;
13798944Sobrien
13898944Sobrien#define LEVEL(t,b,v)					\
13998944Sobrien	do {						\
140130803Smarcel		if ((in & ((1 << b) -1)) == v) {	\
14198944Sobrien			*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj;	\
14298944Sobrien			return t;			\
14398944Sobrien		}					\
14498944Sobrien		adj += 1ULL << (t - b);			\
14598944Sobrien	} while (0)
14698944Sobrien
14798944Sobrien	VLI_L_1_1();
14898944Sobrien
14998944Sobrien	/* NOT REACHED, if VLI_LEVELS code table is defined properly */
15098944Sobrien	BUG();
151130803Smarcel#undef LEVEL
15298944Sobrien}
15398944Sobrien
15498944Sobrien/* return number of code bits needed,
15598944Sobrien * or negative error number */
15698944Sobrienstatic inline int __vli_encode_bits(u64 *out, const u64 in)
15798944Sobrien{
15898944Sobrien	u64 max = 0;
15998944Sobrien	u64 adj = 1;
16098944Sobrien
16198944Sobrien	if (in == 0)
162130803Smarcel		return -EINVAL;
16398944Sobrien
16498944Sobrien#define LEVEL(t,b,v) do {		\
16598944Sobrien		max += 1ULL << (t - b);	\
16698944Sobrien		if (in <= max) {	\
16798944Sobrien			if (out)	\
16898944Sobrien				*out = ((in - adj) << b) | v;	\
16998944Sobrien			return t;	\
17098944Sobrien		}			\
17198944Sobrien		adj = max + 1;		\
17298944Sobrien	} while (0)
173130803Smarcel
17498944Sobrien	VLI_L_1_1();
17598944Sobrien
17698944Sobrien	return -EOVERFLOW;
17798944Sobrien#undef LEVEL
17898944Sobrien}
17998944Sobrien
18098944Sobrien#undef VLI_L_1_1
18198944Sobrien
18298944Sobrien/* code from here down is independend of actually used bit code */
18398944Sobrien
18498944Sobrien/*
18598944Sobrien * Code length is determined by some unique (e.g. unary) prefix.
18698944Sobrien * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
18798944Sobrien * not a byte stream.
18898944Sobrien */
18998944Sobrien
19098944Sobrien/* for the bitstream, we need a cursor */
19198944Sobrienstruct bitstream_cursor {
192130803Smarcel	/* the current byte */
19398944Sobrien	u8 *b;
19498944Sobrien	/* the current bit within *b, nomalized: 0..7 */
19598944Sobrien	unsigned int bit;
19698944Sobrien};
19798944Sobrien
19898944Sobrien/* initialize cursor to point to first bit of stream */
19998944Sobrienstatic inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
20098944Sobrien{
20198944Sobrien	cur->b = s;
20298944Sobrien	cur->bit = 0;
203130803Smarcel}
20498944Sobrien
20598944Sobrien/* advance cursor by that many bits; maximum expected input value: 64,
20698944Sobrien * but depending on VLI implementation, it may be more. */
20798944Sobrienstatic inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
20898944Sobrien{
20998944Sobrien	bits += cur->bit;
21098944Sobrien	cur->b = cur->b + (bits >> 3);
21198944Sobrien	cur->bit = bits & 7;
21298944Sobrien}
21398944Sobrien
214130803Smarcel/* the bitstream itself knows its length */
21598944Sobrienstruct bitstream {
21698944Sobrien	struct bitstream_cursor cur;
21798944Sobrien	unsigned char *buf;
21898944Sobrien	size_t buf_len;		/* in bytes */
21998944Sobrien
22098944Sobrien	/* for input stream:
22198944Sobrien	 * number of trailing 0 bits for padding
22298944Sobrien	 * total number of valid bits in stream: buf_len * 8 - pad_bits */
22398944Sobrien	unsigned int pad_bits;
22498944Sobrien};
225130803Smarcel
22698944Sobrienstatic inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
22798944Sobrien{
228130803Smarcel	bs->buf = s;
229130803Smarcel	bs->buf_len = len;
230130803Smarcel	bs->pad_bits = pad_bits;
231130803Smarcel	bitstream_cursor_reset(&bs->cur, bs->buf);
232130803Smarcel}
23398944Sobrien
234130803Smarcelstatic inline void bitstream_rewind(struct bitstream *bs)
23598944Sobrien{
236130803Smarcel	bitstream_cursor_reset(&bs->cur, bs->buf);
237130803Smarcel	memset(bs->buf, 0, bs->buf_len);
238130803Smarcel}
23998944Sobrien
240130803Smarcel/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
241130803Smarcel * Ignores "pad_bits".
242130803Smarcel * Returns zero if bits == 0 (nothing to do).
24398944Sobrien * Returns number of bits used if successful.
24498944Sobrien *
24598944Sobrien * If there is not enough room left in bitstream,
24698944Sobrien * leaves bitstream unchanged and returns -ENOBUFS.
24798944Sobrien */
24898944Sobrienstatic inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
24998944Sobrien{
250130803Smarcel	unsigned char *b = bs->cur.b;
25198944Sobrien	unsigned int tmp;
25298944Sobrien
25398944Sobrien	if (bits == 0)
25498944Sobrien		return 0;
25598944Sobrien
25698944Sobrien	if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
25798944Sobrien		return -ENOBUFS;
25898944Sobrien
25998944Sobrien	/* paranoia: strip off hi bits; they should not be set anyways. */
260130803Smarcel	if (bits < 64)
26198944Sobrien		val &= ~0ULL >> (64 - bits);
26298944Sobrien
26398944Sobrien	*b++ |= (val & 0xff) << bs->cur.bit;
264130803Smarcel
26598944Sobrien	for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
266130803Smarcel		*b++ |= (val >> tmp) & 0xff;
267130803Smarcel
268130803Smarcel	bitstream_cursor_advance(&bs->cur, bits);
269130803Smarcel	return bits;
27098944Sobrien}
27198944Sobrien
27298944Sobrien/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
273130803Smarcel *
274130803Smarcel * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
275130803Smarcel *
27698944Sobrien * If there are less than the requested number of valid bits left in the
277130803Smarcel * bitstream, still fetches all available bits.
278130803Smarcel *
27998944Sobrien * Returns number of actually fetched bits.
28098944Sobrien */
28198944Sobrienstatic inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
28298944Sobrien{
28398944Sobrien	u64 val;
28498944Sobrien	unsigned int n;
28598944Sobrien
28698944Sobrien	if (bits > 64)
287130803Smarcel		return -EINVAL;
28898944Sobrien
28998944Sobrien	if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
29098944Sobrien		bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
29198944Sobrien			- bs->cur.bit - bs->pad_bits;
29298944Sobrien
29398944Sobrien	if (bits == 0) {
29498944Sobrien		*out = 0;
29598944Sobrien		return 0;
29698944Sobrien	}
29798944Sobrien
29898944Sobrien	/* get the high bits */
29998944Sobrien	val = 0;
30098944Sobrien	n = (bs->cur.bit + bits + 7) >> 3;
301130803Smarcel	/* n may be at most 9, if cur.bit + bits > 64 */
30298944Sobrien	/* which means this copies at most 8 byte */
30398944Sobrien	if (n) {
30498944Sobrien		memcpy(&val, bs->cur.b+1, n - 1);
305130803Smarcel		val = le64_to_cpu(val) << (8 - bs->cur.bit);
30698944Sobrien	}
307130803Smarcel
308130803Smarcel	/* we still need the low bits */
309130803Smarcel	val |= bs->cur.b[0] >> bs->cur.bit;
31098944Sobrien
311130803Smarcel	/* and mask out bits we don't want */
31298944Sobrien	val &= ~0ULL >> (64 - bits);
313130803Smarcel
314130803Smarcel	bitstream_cursor_advance(&bs->cur, bits);
315130803Smarcel	*out = val;
31698944Sobrien
31798944Sobrien	return bits;
31898944Sobrien}
31998944Sobrien
32098944Sobrien/* encodes @in as vli into @bs;
32198944Sobrien
322130803Smarcel * return values
32398944Sobrien *  > 0: number of bits successfully stored in bitstream
32498944Sobrien * -ENOBUFS @bs is full
32598944Sobrien * -EINVAL input zero (invalid)
32698944Sobrien * -EOVERFLOW input too large for this vli code (invalid)
32798944Sobrien */
32898944Sobrienstatic inline int vli_encode_bits(struct bitstream *bs, u64 in)
32998944Sobrien{
33098944Sobrien	u64 code;
33198944Sobrien	int bits = __vli_encode_bits(&code, in);
33298944Sobrien
33398944Sobrien	if (bits <= 0)
33498944Sobrien		return bits;
33598944Sobrien
336130803Smarcel	return bitstream_put_bits(bs, code, bits);
33798944Sobrien}
33898944Sobrien
33998944Sobrien#endif
340130803Smarcel