1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Historical Service Time
4 *
5 *  Keeps a time-weighted exponential moving average of the historical
6 *  service time. Estimates future service time based on the historical
7 *  service time and the number of outstanding requests.
8 *
9 *  Marks paths stale if they have not finished within hst *
10 *  num_paths. If a path is stale and unused, we will send a single
11 *  request to probe in case the path has improved. This situation
12 *  generally arises if the path is so much worse than others that it
13 *  will never have the best estimated service time, or if the entire
14 *  multipath device is unused. If a path is stale and in use, limit the
15 *  number of requests it can receive with the assumption that the path
16 *  has become degraded.
17 *
18 *  To avoid repeatedly calculating exponents for time weighting, times
19 *  are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
20 *  ns, and the weighting is pre-calculated.
21 *
22 */
23
24#include "dm.h"
25#include "dm-path-selector.h"
26
27#include <linux/blkdev.h>
28#include <linux/slab.h>
29#include <linux/module.h>
30
31
32#define DM_MSG_PREFIX	"multipath historical-service-time"
33#define HST_MIN_IO 1
34#define HST_VERSION "0.1.1"
35
36#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
37#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
38#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
39#define HST_FIXED_95 972
40#define HST_MAX_INFLIGHT HST_FIXED_1
41#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
42#define HST_WEIGHT_COUNT 64ULL
43
44struct selector {
45	struct list_head valid_paths;
46	struct list_head failed_paths;
47	int valid_count;
48	spinlock_t lock;
49
50	unsigned int weights[HST_WEIGHT_COUNT];
51	unsigned int threshold_multiplier;
52};
53
54struct path_info {
55	struct list_head list;
56	struct dm_path *path;
57	unsigned int repeat_count;
58
59	spinlock_t lock;
60
61	u64 historical_service_time; /* Fixed point */
62
63	u64 stale_after;
64	u64 last_finish;
65
66	u64 outstanding;
67};
68
69/**
70 * fixed_power - compute: x^n, in O(log n) time
71 *
72 * @x:         base of the power
73 * @frac_bits: fractional bits of @x
74 * @n:         power to raise @x to.
75 *
76 * By exploiting the relation between the definition of the natural power
77 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
78 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
79 * (where: n_i \elem {0, 1}, the binary vector representing n),
80 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
81 * of course trivially computable in O(log_2 n), the length of our binary
82 * vector.
83 *
84 * (see: kernel/sched/loadavg.c)
85 */
86static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
87{
88	unsigned long result = 1UL << frac_bits;
89
90	if (n) {
91		for (;;) {
92			if (n & 1) {
93				result *= x;
94				result += 1UL << (frac_bits - 1);
95				result >>= frac_bits;
96			}
97			n >>= 1;
98			if (!n)
99				break;
100			x *= x;
101			x += 1UL << (frac_bits - 1);
102			x >>= frac_bits;
103		}
104	}
105
106	return result;
107}
108
109/*
110 * Calculate the next value of an exponential moving average
111 * a_1 = a_0 * e + a * (1 - e)
112 *
113 * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
114 * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
115 * @weight: [0, HST_FIXED_1]
116 *
117 * Note:
118 *   To account for multiple periods in the same calculation,
119 *   a_n = a_0 * e^n + a * (1 - e^n),
120 *   so call fixed_ema(last, next, pow(weight, N))
121 */
122static u64 fixed_ema(u64 last, u64 next, u64 weight)
123{
124	last *= weight;
125	last += next * (HST_FIXED_1 - weight);
126	last += 1ULL << (HST_FIXED_SHIFT - 1);
127	return last >> HST_FIXED_SHIFT;
128}
129
130static struct selector *alloc_selector(void)
131{
132	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
133
134	if (s) {
135		INIT_LIST_HEAD(&s->valid_paths);
136		INIT_LIST_HEAD(&s->failed_paths);
137		spin_lock_init(&s->lock);
138		s->valid_count = 0;
139	}
140
141	return s;
142}
143
144/*
145 * Get the weight for a given time span.
146 */
147static u64 hst_weight(struct path_selector *ps, u64 delta)
148{
149	struct selector *s = ps->context;
150	int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
151			   HST_WEIGHT_COUNT - 1);
152
153	return s->weights[bucket];
154}
155
156/*
157 * Set up the weights array.
158 *
159 * weights[len-1] = 0
160 * weights[n] = base ^ (n + 1)
161 */
162static void hst_set_weights(struct path_selector *ps, unsigned int base)
163{
164	struct selector *s = ps->context;
165	int i;
166
167	if (base >= HST_FIXED_1)
168		return;
169
170	for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
171		s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
172	s->weights[HST_WEIGHT_COUNT - 1] = 0;
173}
174
175static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
176{
177	struct selector *s;
178	unsigned int base_weight = HST_FIXED_95;
179	unsigned int threshold_multiplier = 0;
180	char dummy;
181
182	/*
183	 * Arguments: [<base_weight> [<threshold_multiplier>]]
184	 *   <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
185	 *                  value of 0 will completely ignore any history.
186	 *                  If not given, default (HST_FIXED_95) is used.
187	 *   <threshold_multiplier>: Minimum threshold multiplier for paths to
188	 *                  be considered different. That is, a path is
189	 *                  considered different iff (p1 > N * p2) where p1
190	 *                  is the path with higher service time. A threshold
191	 *                  of 1 or 0 has no effect. Defaults to 0.
192	 */
193	if (argc > 2)
194		return -EINVAL;
195
196	if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
197	     base_weight >= HST_FIXED_1)) {
198		return -EINVAL;
199	}
200
201	if (argc > 1 && (sscanf(argv[1], "%u%c",
202				&threshold_multiplier, &dummy) != 1)) {
203		return -EINVAL;
204	}
205
206	s = alloc_selector();
207	if (!s)
208		return -ENOMEM;
209
210	ps->context = s;
211
212	hst_set_weights(ps, base_weight);
213	s->threshold_multiplier = threshold_multiplier;
214	return 0;
215}
216
217static void free_paths(struct list_head *paths)
218{
219	struct path_info *pi, *next;
220
221	list_for_each_entry_safe(pi, next, paths, list) {
222		list_del(&pi->list);
223		kfree(pi);
224	}
225}
226
227static void hst_destroy(struct path_selector *ps)
228{
229	struct selector *s = ps->context;
230
231	free_paths(&s->valid_paths);
232	free_paths(&s->failed_paths);
233	kfree(s);
234	ps->context = NULL;
235}
236
237static int hst_status(struct path_selector *ps, struct dm_path *path,
238		     status_type_t type, char *result, unsigned int maxlen)
239{
240	unsigned int sz = 0;
241	struct path_info *pi;
242
243	if (!path) {
244		struct selector *s = ps->context;
245
246		DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
247	} else {
248		pi = path->pscontext;
249
250		switch (type) {
251		case STATUSTYPE_INFO:
252			DMEMIT("%llu %llu %llu ", pi->historical_service_time,
253			       pi->outstanding, pi->stale_after);
254			break;
255		case STATUSTYPE_TABLE:
256			DMEMIT("0 ");
257			break;
258		case STATUSTYPE_IMA:
259			*result = '\0';
260			break;
261		}
262	}
263
264	return sz;
265}
266
267static int hst_add_path(struct path_selector *ps, struct dm_path *path,
268		       int argc, char **argv, char **error)
269{
270	struct selector *s = ps->context;
271	struct path_info *pi;
272	unsigned int repeat_count = HST_MIN_IO;
273	char dummy;
274	unsigned long flags;
275
276	/*
277	 * Arguments: [<repeat_count>]
278	 *   <repeat_count>: The number of I/Os before switching path.
279	 *                   If not given, default (HST_MIN_IO) is used.
280	 */
281	if (argc > 1) {
282		*error = "historical-service-time ps: incorrect number of arguments";
283		return -EINVAL;
284	}
285
286	if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
287		*error = "historical-service-time ps: invalid repeat count";
288		return -EINVAL;
289	}
290
291	/* allocate the path */
292	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
293	if (!pi) {
294		*error = "historical-service-time ps: Error allocating path context";
295		return -ENOMEM;
296	}
297
298	pi->path = path;
299	pi->repeat_count = repeat_count;
300
301	pi->historical_service_time = HST_FIXED_1;
302
303	spin_lock_init(&pi->lock);
304	pi->outstanding = 0;
305
306	pi->stale_after = 0;
307	pi->last_finish = 0;
308
309	path->pscontext = pi;
310
311	spin_lock_irqsave(&s->lock, flags);
312	list_add_tail(&pi->list, &s->valid_paths);
313	s->valid_count++;
314	spin_unlock_irqrestore(&s->lock, flags);
315
316	return 0;
317}
318
319static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
320{
321	struct selector *s = ps->context;
322	struct path_info *pi = path->pscontext;
323	unsigned long flags;
324
325	spin_lock_irqsave(&s->lock, flags);
326	list_move(&pi->list, &s->failed_paths);
327	s->valid_count--;
328	spin_unlock_irqrestore(&s->lock, flags);
329}
330
331static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
332{
333	struct selector *s = ps->context;
334	struct path_info *pi = path->pscontext;
335	unsigned long flags;
336
337	spin_lock_irqsave(&s->lock, flags);
338	list_move_tail(&pi->list, &s->valid_paths);
339	s->valid_count++;
340	spin_unlock_irqrestore(&s->lock, flags);
341
342	return 0;
343}
344
345static void hst_fill_compare(struct path_info *pi, u64 *hst,
346			     u64 *out, u64 *stale)
347{
348	unsigned long flags;
349
350	spin_lock_irqsave(&pi->lock, flags);
351	*hst = pi->historical_service_time;
352	*out = pi->outstanding;
353	*stale = pi->stale_after;
354	spin_unlock_irqrestore(&pi->lock, flags);
355}
356
357/*
358 * Compare the estimated service time of 2 paths, pi1 and pi2,
359 * for the incoming I/O.
360 *
361 * Returns:
362 * < 0 : pi1 is better
363 * 0   : no difference between pi1 and pi2
364 * > 0 : pi2 is better
365 *
366 */
367static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
368			     u64 time_now, struct path_selector *ps)
369{
370	struct selector *s = ps->context;
371	u64 hst1, hst2;
372	long long out1, out2, stale1, stale2;
373	int pi2_better, over_threshold;
374
375	hst_fill_compare(pi1, &hst1, &out1, &stale1);
376	hst_fill_compare(pi2, &hst2, &out2, &stale2);
377
378	/* Check here if estimated latency for two paths are too similar.
379	 * If this is the case, we skip extra calculation and just compare
380	 * outstanding requests. In this case, any unloaded paths will
381	 * be preferred.
382	 */
383	if (hst1 > hst2)
384		over_threshold = hst1 > (s->threshold_multiplier * hst2);
385	else
386		over_threshold = hst2 > (s->threshold_multiplier * hst1);
387
388	if (!over_threshold)
389		return out1 - out2;
390
391	/*
392	 * If an unloaded path is stale, choose it. If both paths are unloaded,
393	 * choose path that is the most stale.
394	 * (If one path is loaded, choose the other)
395	 */
396	if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
397	    (!out1 && !out2))
398		return (!out2 * stale1) - (!out1 * stale2);
399
400	/* Compare estimated service time. If outstanding is the same, we
401	 * don't need to multiply
402	 */
403	if (out1 == out2) {
404		pi2_better = hst1 > hst2;
405	} else {
406		/* Potential overflow with out >= 1024 */
407		if (unlikely(out1 >= HST_MAX_INFLIGHT ||
408			     out2 >= HST_MAX_INFLIGHT)) {
409			/* If over 1023 in-flights, we may overflow if hst
410			 * is at max. (With this shift we still overflow at
411			 * 1048576 in-flights, which is high enough).
412			 */
413			hst1 >>= HST_FIXED_SHIFT;
414			hst2 >>= HST_FIXED_SHIFT;
415		}
416		pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
417	}
418
419	/* In the case that the 'winner' is stale, limit to equal usage. */
420	if (pi2_better) {
421		if (stale2 < time_now)
422			return out1 - out2;
423		return 1;
424	}
425	if (stale1 < time_now)
426		return out1 - out2;
427	return -1;
428}
429
430static struct dm_path *hst_select_path(struct path_selector *ps,
431				       size_t nr_bytes)
432{
433	struct selector *s = ps->context;
434	struct path_info *pi = NULL, *best = NULL;
435	u64 time_now = ktime_get_ns();
436	struct dm_path *ret = NULL;
437	unsigned long flags;
438
439	spin_lock_irqsave(&s->lock, flags);
440	if (list_empty(&s->valid_paths))
441		goto out;
442
443	list_for_each_entry(pi, &s->valid_paths, list) {
444		if (!best || (hst_compare(pi, best, time_now, ps) < 0))
445			best = pi;
446	}
447
448	if (!best)
449		goto out;
450
451	/* Move last used path to end (least preferred in case of ties) */
452	list_move_tail(&best->list, &s->valid_paths);
453
454	ret = best->path;
455
456out:
457	spin_unlock_irqrestore(&s->lock, flags);
458	return ret;
459}
460
461static int hst_start_io(struct path_selector *ps, struct dm_path *path,
462			size_t nr_bytes)
463{
464	struct path_info *pi = path->pscontext;
465	unsigned long flags;
466
467	spin_lock_irqsave(&pi->lock, flags);
468	pi->outstanding++;
469	spin_unlock_irqrestore(&pi->lock, flags);
470
471	return 0;
472}
473
474static u64 path_service_time(struct path_info *pi, u64 start_time)
475{
476	u64 now = ktime_get_ns();
477
478	/* if a previous disk request has finished after this IO was
479	 * sent to the hardware, pretend the submission happened
480	 * serially.
481	 */
482	if (time_after64(pi->last_finish, start_time))
483		start_time = pi->last_finish;
484
485	pi->last_finish = now;
486	if (time_before64(now, start_time))
487		return 0;
488
489	return now - start_time;
490}
491
492static int hst_end_io(struct path_selector *ps, struct dm_path *path,
493		      size_t nr_bytes, u64 start_time)
494{
495	struct path_info *pi = path->pscontext;
496	struct selector *s = ps->context;
497	unsigned long flags;
498	u64 st;
499
500	spin_lock_irqsave(&pi->lock, flags);
501
502	st = path_service_time(pi, start_time);
503	pi->outstanding--;
504	pi->historical_service_time =
505		fixed_ema(pi->historical_service_time,
506			  min(st * HST_FIXED_1, HST_FIXED_MAX),
507			  hst_weight(ps, st));
508
509	/*
510	 * On request end, mark path as fresh. If a path hasn't
511	 * finished any requests within the fresh period, the estimated
512	 * service time is considered too optimistic and we limit the
513	 * maximum requests on that path.
514	 */
515	pi->stale_after = pi->last_finish +
516		(s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
517
518	spin_unlock_irqrestore(&pi->lock, flags);
519
520	return 0;
521}
522
523static struct path_selector_type hst_ps = {
524	.name		= "historical-service-time",
525	.module		= THIS_MODULE,
526	.features	= DM_PS_USE_HR_TIMER,
527	.table_args	= 1,
528	.info_args	= 3,
529	.create		= hst_create,
530	.destroy	= hst_destroy,
531	.status		= hst_status,
532	.add_path	= hst_add_path,
533	.fail_path	= hst_fail_path,
534	.reinstate_path	= hst_reinstate_path,
535	.select_path	= hst_select_path,
536	.start_io	= hst_start_io,
537	.end_io		= hst_end_io,
538};
539
540static int __init dm_hst_init(void)
541{
542	int r = dm_register_path_selector(&hst_ps);
543
544	if (r < 0)
545		DMERR("register failed %d", r);
546
547	DMINFO("version " HST_VERSION " loaded");
548
549	return r;
550}
551
552static void __exit dm_hst_exit(void)
553{
554	int r = dm_unregister_path_selector(&hst_ps);
555
556	if (r < 0)
557		DMERR("unregister failed %d", r);
558}
559
560module_init(dm_hst_init);
561module_exit(dm_hst_exit);
562
563MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
564MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
565MODULE_LICENSE("GPL");
566