1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2015 Red Hat. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm-cache-background-tracker.h"
9#include "dm-cache-policy-internal.h"
10#include "dm-cache-policy.h"
11#include "dm.h"
12
13#include <linux/hash.h>
14#include <linux/jiffies.h>
15#include <linux/module.h>
16#include <linux/mutex.h>
17#include <linux/vmalloc.h>
18#include <linux/math64.h>
19
20#define DM_MSG_PREFIX "cache-policy-smq"
21
22/*----------------------------------------------------------------*/
23
24/*
25 * Safe division functions that return zero on divide by zero.
26 */
27static unsigned int safe_div(unsigned int n, unsigned int d)
28{
29	return d ? n / d : 0u;
30}
31
32static unsigned int safe_mod(unsigned int n, unsigned int d)
33{
34	return d ? n % d : 0u;
35}
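
/*
 * Worked example (illustrative only, values made up): these helpers act
 * like the plain C operators except that a zero divisor yields zero
 * rather than faulting:
 *
 *	safe_div(7u, 2u) == 3u,  safe_mod(7u, 2u) == 1u
 *	safe_div(7u, 0u) == 0u,  safe_mod(7u, 0u) == 0u
 *
 * Callers such as q_set_targets_subrange_() and stats_assess() below
 * rely on this so they don't have to special-case empty ranges.
 */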
36
37/*----------------------------------------------------------------*/
38
39struct entry {
40	unsigned int hash_next:28;
41	unsigned int prev:28;
42	unsigned int next:28;
43	unsigned int level:6;
44	bool dirty:1;
45	bool allocated:1;
46	bool sentinel:1;
47	bool pending_work:1;
48
49	dm_oblock_t oblock;
50};
51
52/*----------------------------------------------------------------*/
53
54#define INDEXER_NULL ((1u << 28u) - 1u)
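
/*
 * Note (illustrative): entries are linked by array index rather than by
 * pointer, which keeps struct entry small.  A 28 bit field can hold
 * values 0..(1u << 28) - 1, and the top value (268435455) is reserved
 * as INDEXER_NULL, the index-based equivalent of a NULL pointer:
 *
 *	e->next = INDEXER_NULL;			// links to nothing
 *	if (e->next == INDEXER_NULL)
 *		... end of list ...
 *
 * to_entry() below maps INDEXER_NULL back to NULL so callers can test
 * results like ordinary pointers.
 */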
55
56/*
57 * An entry_space manages a set of entries that we use for the queues.
58 * The clean and dirty queues share entries, so this object is separate
59 * from the queue itself.
60 */
61struct entry_space {
62	struct entry *begin;
63	struct entry *end;
64};
65
66static int space_init(struct entry_space *es, unsigned int nr_entries)
67{
68	if (!nr_entries) {
69		es->begin = es->end = NULL;
70		return 0;
71	}
72
73	es->begin = vzalloc(array_size(nr_entries, sizeof(struct entry)));
74	if (!es->begin)
75		return -ENOMEM;
76
77	es->end = es->begin + nr_entries;
78	return 0;
79}
80
81static void space_exit(struct entry_space *es)
82{
83	vfree(es->begin);
84}
85
86static struct entry *__get_entry(struct entry_space *es, unsigned int block)
87{
88	struct entry *e;
89
90	e = es->begin + block;
91	BUG_ON(e >= es->end);
92
93	return e;
94}
95
96static unsigned int to_index(struct entry_space *es, struct entry *e)
97{
98	BUG_ON(e < es->begin || e >= es->end);
99	return e - es->begin;
100}
101
102static struct entry *to_entry(struct entry_space *es, unsigned int block)
103{
104	if (block == INDEXER_NULL)
105		return NULL;
106
107	return __get_entry(es, block);
108}
109
110/*----------------------------------------------------------------*/
111
112struct ilist {
113	unsigned int nr_elts;	/* excluding sentinel entries */
114	unsigned int head, tail;
115};
116
117static void l_init(struct ilist *l)
118{
119	l->nr_elts = 0;
120	l->head = l->tail = INDEXER_NULL;
121}
122
123static struct entry *l_head(struct entry_space *es, struct ilist *l)
124{
125	return to_entry(es, l->head);
126}
127
128static struct entry *l_tail(struct entry_space *es, struct ilist *l)
129{
130	return to_entry(es, l->tail);
131}
132
133static struct entry *l_next(struct entry_space *es, struct entry *e)
134{
135	return to_entry(es, e->next);
136}
137
138static struct entry *l_prev(struct entry_space *es, struct entry *e)
139{
140	return to_entry(es, e->prev);
141}
142
143static bool l_empty(struct ilist *l)
144{
145	return l->head == INDEXER_NULL;
146}
147
148static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
149{
150	struct entry *head = l_head(es, l);
151
152	e->next = l->head;
153	e->prev = INDEXER_NULL;
154
155	if (head)
156		head->prev = l->head = to_index(es, e);
157	else
158		l->head = l->tail = to_index(es, e);
159
160	if (!e->sentinel)
161		l->nr_elts++;
162}
163
164static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
165{
166	struct entry *tail = l_tail(es, l);
167
168	e->next = INDEXER_NULL;
169	e->prev = l->tail;
170
171	if (tail)
172		tail->next = l->tail = to_index(es, e);
173	else
174		l->head = l->tail = to_index(es, e);
175
176	if (!e->sentinel)
177		l->nr_elts++;
178}
179
180static void l_add_before(struct entry_space *es, struct ilist *l,
181			 struct entry *old, struct entry *e)
182{
183	struct entry *prev = l_prev(es, old);
184
185	if (!prev)
186		l_add_head(es, l, e);
187
188	else {
189		e->prev = old->prev;
190		e->next = to_index(es, old);
191		prev->next = old->prev = to_index(es, e);
192
193		if (!e->sentinel)
194			l->nr_elts++;
195	}
196}
197
198static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
199{
200	struct entry *prev = l_prev(es, e);
201	struct entry *next = l_next(es, e);
202
203	if (prev)
204		prev->next = e->next;
205	else
206		l->head = e->next;
207
208	if (next)
209		next->prev = e->prev;
210	else
211		l->tail = e->prev;
212
213	if (!e->sentinel)
214		l->nr_elts--;
215}
216
217static struct entry *l_pop_head(struct entry_space *es, struct ilist *l)
218{
219	struct entry *e;
220
221	for (e = l_head(es, l); e; e = l_next(es, e))
222		if (!e->sentinel) {
223			l_del(es, l, e);
224			return e;
225		}
226
227	return NULL;
228}
229
230static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
231{
232	struct entry *e;
233
234	for (e = l_tail(es, l); e; e = l_prev(es, e))
235		if (!e->sentinel) {
236			l_del(es, l, e);
237			return e;
238		}
239
240	return NULL;
241}
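
/*
 * Usage sketch (illustrative only; assumes an entry_space 'es' that has
 * already been set up with space_init() and holds at least two
 * non-sentinel entries):
 *
 *	struct ilist l;
 *	struct entry *a = __get_entry(&es, 0);
 *	struct entry *b = __get_entry(&es, 1);
 *
 *	l_init(&l);
 *	l_add_tail(&es, &l, a);		// list: a
 *	l_add_tail(&es, &l, b);		// list: a, b
 *	l_pop_head(&es, &l);		// returns a; list: b
 *
 * The l_pop_* helpers skip sentinel entries, so a list containing only
 * sentinels behaves as empty for them even though l_empty() is false.
 */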
242
243/*----------------------------------------------------------------*/
244
245/*
246 * The stochastic-multi-queue is a set of lru lists stacked into levels.
247 * Entries are moved up levels when they are used, which loosely orders the
248 * most accessed entries in the top levels and least in the bottom.  This
249 * structure is *much* better than a single lru list.
250 */
251#define MAX_LEVELS 64u
252
253struct queue {
254	struct entry_space *es;
255
256	unsigned int nr_elts;
257	unsigned int nr_levels;
258	struct ilist qs[MAX_LEVELS];
259
260	/*
261	 * We maintain a count of the number of entries we would like in each
262	 * level.
263	 */
264	unsigned int last_target_nr_elts;
265	unsigned int nr_top_levels;
266	unsigned int nr_in_top_levels;
267	unsigned int target_count[MAX_LEVELS];
268};
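
/*
 * Conceptual sketch (illustrative, entry names made up): with
 * nr_levels = 4 the queue is simply four lru lists,
 *
 *	level 3 (hottest)  e9 -> e2
 *	level 2            e7
 *	level 1            e5 -> e1 -> e4
 *	level 0 (coldest)  e3 -> e8
 *
 * q_push() appends an entry to the tail of the list selected by
 * e->level, q_requeue() moves an entry up one or more levels on a hit,
 * and q_peek()/q_pop() take victims from the lowest populated level.
 */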
269
270static void q_init(struct queue *q, struct entry_space *es, unsigned int nr_levels)
271{
272	unsigned int i;
273
274	q->es = es;
275	q->nr_elts = 0;
276	q->nr_levels = nr_levels;
277
278	for (i = 0; i < q->nr_levels; i++) {
279		l_init(q->qs + i);
280		q->target_count[i] = 0u;
281	}
282
283	q->last_target_nr_elts = 0u;
284	q->nr_top_levels = 0u;
285	q->nr_in_top_levels = 0u;
286}
287
288static unsigned int q_size(struct queue *q)
289{
290	return q->nr_elts;
291}
292
/*
 * Insert an entry at the back of the given level.
 */
296static void q_push(struct queue *q, struct entry *e)
297{
298	BUG_ON(e->pending_work);
299
300	if (!e->sentinel)
301		q->nr_elts++;
302
303	l_add_tail(q->es, q->qs + e->level, e);
304}
305
306static void q_push_front(struct queue *q, struct entry *e)
307{
308	BUG_ON(e->pending_work);
309
310	if (!e->sentinel)
311		q->nr_elts++;
312
313	l_add_head(q->es, q->qs + e->level, e);
314}
315
316static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
317{
318	BUG_ON(e->pending_work);
319
320	if (!e->sentinel)
321		q->nr_elts++;
322
323	l_add_before(q->es, q->qs + e->level, old, e);
324}
325
326static void q_del(struct queue *q, struct entry *e)
327{
328	l_del(q->es, q->qs + e->level, e);
329	if (!e->sentinel)
330		q->nr_elts--;
331}
332
333/*
334 * Return the oldest entry of the lowest populated level.
335 */
336static struct entry *q_peek(struct queue *q, unsigned int max_level, bool can_cross_sentinel)
337{
338	unsigned int level;
339	struct entry *e;
340
341	max_level = min(max_level, q->nr_levels);
342
343	for (level = 0; level < max_level; level++)
344		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
345			if (e->sentinel) {
346				if (can_cross_sentinel)
347					continue;
348				else
349					break;
350			}
351
352			return e;
353		}
354
355	return NULL;
356}
357
358static struct entry *q_pop(struct queue *q)
359{
360	struct entry *e = q_peek(q, q->nr_levels, true);
361
362	if (e)
363		q_del(q, e);
364
365	return e;
366}
367
368/*
369 * This function assumes there is a non-sentinel entry to pop.  It's only
370 * used by redistribute, so we know this is true.  It also doesn't adjust
371 * the q->nr_elts count.
372 */
373static struct entry *__redist_pop_from(struct queue *q, unsigned int level)
374{
375	struct entry *e;
376
377	for (; level < q->nr_levels; level++)
378		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
379			if (!e->sentinel) {
380				l_del(q->es, q->qs + e->level, e);
381				return e;
382			}
383
384	return NULL;
385}
386
387static void q_set_targets_subrange_(struct queue *q, unsigned int nr_elts,
388				    unsigned int lbegin, unsigned int lend)
389{
390	unsigned int level, nr_levels, entries_per_level, remainder;
391
392	BUG_ON(lbegin > lend);
393	BUG_ON(lend > q->nr_levels);
394	nr_levels = lend - lbegin;
395	entries_per_level = safe_div(nr_elts, nr_levels);
396	remainder = safe_mod(nr_elts, nr_levels);
397
398	for (level = lbegin; level < lend; level++)
399		q->target_count[level] =
400			(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
401}
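
/*
 * Worked example (illustrative numbers): distributing nr_elts = 10
 * across the level range [0, 4) gives entries_per_level = 10 / 4 = 2
 * and remainder = 10 % 4 = 2, so the first two levels get one extra
 * entry:
 *
 *	target_count[0] = 3
 *	target_count[1] = 3
 *	target_count[2] = 2
 *	target_count[3] = 2
 *
 * q_redistribute() below then shuffles entries between adjacent levels
 * until each level holds roughly its target count.
 */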
402
/*
 * Typically we have fewer elements in the top few levels, which allows
 * us to adjust the promote threshold nicely.
 */
407static void q_set_targets(struct queue *q)
408{
409	if (q->last_target_nr_elts == q->nr_elts)
410		return;
411
412	q->last_target_nr_elts = q->nr_elts;
413
414	if (q->nr_top_levels > q->nr_levels)
415		q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
416
417	else {
418		q_set_targets_subrange_(q, q->nr_in_top_levels,
419					q->nr_levels - q->nr_top_levels, q->nr_levels);
420
421		if (q->nr_in_top_levels < q->nr_elts)
422			q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
423						0, q->nr_levels - q->nr_top_levels);
424		else
425			q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
426	}
427}
428
429static void q_redistribute(struct queue *q)
430{
431	unsigned int target, level;
432	struct ilist *l, *l_above;
433	struct entry *e;
434
435	q_set_targets(q);
436
437	for (level = 0u; level < q->nr_levels - 1u; level++) {
438		l = q->qs + level;
439		target = q->target_count[level];
440
441		/*
442		 * Pull down some entries from the level above.
443		 */
444		while (l->nr_elts < target) {
445			e = __redist_pop_from(q, level + 1u);
446			if (!e) {
447				/* bug in nr_elts */
448				break;
449			}
450
451			e->level = level;
452			l_add_tail(q->es, l, e);
453		}
454
455		/*
456		 * Push some entries up.
457		 */
458		l_above = q->qs + level + 1u;
459		while (l->nr_elts > target) {
460			e = l_pop_tail(q->es, l);
461
462			if (!e)
463				/* bug in nr_elts */
464				break;
465
466			e->level = level + 1u;
467			l_add_tail(q->es, l_above, e);
468		}
469	}
470}
471
472static void q_requeue(struct queue *q, struct entry *e, unsigned int extra_levels,
473		      struct entry *s1, struct entry *s2)
474{
475	struct entry *de;
476	unsigned int sentinels_passed = 0;
477	unsigned int new_level = min(q->nr_levels - 1u, e->level + extra_levels);
478
479	/* try and find an entry to swap with */
480	if (extra_levels && (e->level < q->nr_levels - 1u)) {
481		for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
482			sentinels_passed++;
483
484		if (de) {
485			q_del(q, de);
486			de->level = e->level;
487			if (s1) {
488				switch (sentinels_passed) {
489				case 0:
490					q_push_before(q, s1, de);
491					break;
492
493				case 1:
494					q_push_before(q, s2, de);
495					break;
496
497				default:
498					q_push(q, de);
499				}
500			} else
501				q_push(q, de);
502		}
503	}
504
505	q_del(q, e);
506	e->level = new_level;
507	q_push(q, e);
508}
509
510/*----------------------------------------------------------------*/
511
512#define FP_SHIFT 8
513#define SIXTEENTH (1u << (FP_SHIFT - 4u))
514#define EIGHTH (1u << (FP_SHIFT - 3u))
515
516struct stats {
517	unsigned int hit_threshold;
518	unsigned int hits;
519	unsigned int misses;
520};
521
522enum performance {
523	Q_POOR,
524	Q_FAIR,
525	Q_WELL
526};
527
528static void stats_init(struct stats *s, unsigned int nr_levels)
529{
530	s->hit_threshold = (nr_levels * 3u) / 4u;
531	s->hits = 0u;
532	s->misses = 0u;
533}
534
535static void stats_reset(struct stats *s)
536{
537	s->hits = s->misses = 0u;
538}
539
540static void stats_level_accessed(struct stats *s, unsigned int level)
541{
542	if (level >= s->hit_threshold)
543		s->hits++;
544	else
545		s->misses++;
546}
547
548static void stats_miss(struct stats *s)
549{
550	s->misses++;
551}
552
/*
 * There are times when we don't have any confidence in the hotspot
 * queue, such as when a fresh cache is created and the blocks have been
 * spread out across the levels, or when the io load changes.  We detect
 * this by seeing how often a lookup is in the top levels of the hotspot
 * queue.
 */
559static enum performance stats_assess(struct stats *s)
560{
561	unsigned int confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
562
563	if (confidence < SIXTEENTH)
564		return Q_POOR;
565
566	else if (confidence < EIGHTH)
567		return Q_FAIR;
568
569	else
570		return Q_WELL;
571}
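
/*
 * Worked example (illustrative numbers): confidence is a fixed point
 * fraction with FP_SHIFT = 8, so 256 represents 1.0, SIXTEENTH is 16
 * and EIGHTH is 32.  With 5 hits and 95 misses:
 *
 *	confidence = (5 << 8) / (5 + 95) = 1280 / 100 = 12   -> Q_POOR
 *
 * with 10 hits and 90 misses:
 *
 *	confidence = (10 << 8) / 100 = 25                    -> Q_FAIR
 *
 * and with 20 hits and 80 misses:
 *
 *	confidence = (20 << 8) / 100 = 51                    -> Q_WELL
 */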
572
573/*----------------------------------------------------------------*/
574
575struct smq_hash_table {
576	struct entry_space *es;
577	unsigned long long hash_bits;
578	unsigned int *buckets;
579};
580
581/*
582 * All cache entries are stored in a chained hash table.  To save space we
583 * use indexing again, and only store indexes to the next entry.
584 */
585static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned int nr_entries)
586{
587	unsigned int i, nr_buckets;
588
589	ht->es = es;
590	nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
591	ht->hash_bits = __ffs(nr_buckets);
592
593	ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));
594	if (!ht->buckets)
595		return -ENOMEM;
596
597	for (i = 0; i < nr_buckets; i++)
598		ht->buckets[i] = INDEXER_NULL;
599
600	return 0;
601}
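
/*
 * Worked example (illustrative numbers): for nr_entries = 10000 cache
 * entries,
 *
 *	nr_buckets = roundup_pow_of_two(max(10000 / 4, 16)) = 4096
 *	hash_bits  = __ffs(4096) = 12
 *
 * so a bucket chain holds two to three entries on average and
 * h_lookup() only has to walk a short singly linked list.
 */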
602
603static void h_exit(struct smq_hash_table *ht)
604{
605	vfree(ht->buckets);
606}
607
608static struct entry *h_head(struct smq_hash_table *ht, unsigned int bucket)
609{
610	return to_entry(ht->es, ht->buckets[bucket]);
611}
612
613static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
614{
615	return to_entry(ht->es, e->hash_next);
616}
617
618static void __h_insert(struct smq_hash_table *ht, unsigned int bucket, struct entry *e)
619{
620	e->hash_next = ht->buckets[bucket];
621	ht->buckets[bucket] = to_index(ht->es, e);
622}
623
624static void h_insert(struct smq_hash_table *ht, struct entry *e)
625{
626	unsigned int h = hash_64(from_oblock(e->oblock), ht->hash_bits);
627
628	__h_insert(ht, h, e);
629}
630
631static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned int h, dm_oblock_t oblock,
632				struct entry **prev)
633{
634	struct entry *e;
635
636	*prev = NULL;
637	for (e = h_head(ht, h); e; e = h_next(ht, e)) {
638		if (e->oblock == oblock)
639			return e;
640
641		*prev = e;
642	}
643
644	return NULL;
645}
646
647static void __h_unlink(struct smq_hash_table *ht, unsigned int h,
648		       struct entry *e, struct entry *prev)
649{
650	if (prev)
651		prev->hash_next = e->hash_next;
652	else
653		ht->buckets[h] = e->hash_next;
654}
655
656/*
657 * Also moves each entry to the front of the bucket.
658 */
659static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
660{
661	struct entry *e, *prev;
662	unsigned int h = hash_64(from_oblock(oblock), ht->hash_bits);
663
664	e = __h_lookup(ht, h, oblock, &prev);
665	if (e && prev) {
666		/*
667		 * Move to the front because this entry is likely
668		 * to be hit again.
669		 */
670		__h_unlink(ht, h, e, prev);
671		__h_insert(ht, h, e);
672	}
673
674	return e;
675}
676
677static void h_remove(struct smq_hash_table *ht, struct entry *e)
678{
679	unsigned int h = hash_64(from_oblock(e->oblock), ht->hash_bits);
680	struct entry *prev;
681
	/*
	 * The downside of using a singly linked list is that we have to
	 * iterate the bucket to remove an item.
	 */
686	e = __h_lookup(ht, h, e->oblock, &prev);
687	if (e)
688		__h_unlink(ht, h, e, prev);
689}
690
691/*----------------------------------------------------------------*/
692
693struct entry_alloc {
694	struct entry_space *es;
695	unsigned int begin;
696
697	unsigned int nr_allocated;
698	struct ilist free;
699};
700
701static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
702			   unsigned int begin, unsigned int end)
703{
704	unsigned int i;
705
706	ea->es = es;
707	ea->nr_allocated = 0u;
708	ea->begin = begin;
709
710	l_init(&ea->free);
711	for (i = begin; i != end; i++)
712		l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
713}
714
715static void init_entry(struct entry *e)
716{
717	/*
718	 * We can't memset because that would clear the hotspot and
719	 * sentinel bits which remain constant.
720	 */
721	e->hash_next = INDEXER_NULL;
722	e->next = INDEXER_NULL;
723	e->prev = INDEXER_NULL;
724	e->level = 0u;
725	e->dirty = true;	/* FIXME: audit */
726	e->allocated = true;
727	e->sentinel = false;
728	e->pending_work = false;
729}
730
731static struct entry *alloc_entry(struct entry_alloc *ea)
732{
733	struct entry *e;
734
735	if (l_empty(&ea->free))
736		return NULL;
737
738	e = l_pop_head(ea->es, &ea->free);
739	init_entry(e);
740	ea->nr_allocated++;
741
742	return e;
743}
744
745/*
746 * This assumes the cblock hasn't already been allocated.
747 */
748static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned int i)
749{
750	struct entry *e = __get_entry(ea->es, ea->begin + i);
751
752	BUG_ON(e->allocated);
753
754	l_del(ea->es, &ea->free, e);
755	init_entry(e);
756	ea->nr_allocated++;
757
758	return e;
759}
760
761static void free_entry(struct entry_alloc *ea, struct entry *e)
762{
763	BUG_ON(!ea->nr_allocated);
764	BUG_ON(!e->allocated);
765
766	ea->nr_allocated--;
767	e->allocated = false;
768	l_add_tail(ea->es, &ea->free, e);
769}
770
771static bool allocator_empty(struct entry_alloc *ea)
772{
773	return l_empty(&ea->free);
774}
775
776static unsigned int get_index(struct entry_alloc *ea, struct entry *e)
777{
778	return to_index(ea->es, e) - ea->begin;
779}
780
781static struct entry *get_entry(struct entry_alloc *ea, unsigned int index)
782{
783	return __get_entry(ea->es, ea->begin + index);
784}
785
786/*----------------------------------------------------------------*/
787
788#define NR_HOTSPOT_LEVELS 64u
789#define NR_CACHE_LEVELS 64u
790
791#define WRITEBACK_PERIOD (10ul * HZ)
792#define DEMOTE_PERIOD (60ul * HZ)
793
794#define HOTSPOT_UPDATE_PERIOD (HZ)
795#define CACHE_UPDATE_PERIOD (60ul * HZ)
796
797struct smq_policy {
798	struct dm_cache_policy policy;
799
800	/* protects everything */
801	spinlock_t lock;
802	dm_cblock_t cache_size;
803	sector_t cache_block_size;
804
805	sector_t hotspot_block_size;
806	unsigned int nr_hotspot_blocks;
807	unsigned int cache_blocks_per_hotspot_block;
808	unsigned int hotspot_level_jump;
809
810	struct entry_space es;
811	struct entry_alloc writeback_sentinel_alloc;
812	struct entry_alloc demote_sentinel_alloc;
813	struct entry_alloc hotspot_alloc;
814	struct entry_alloc cache_alloc;
815
816	unsigned long *hotspot_hit_bits;
817	unsigned long *cache_hit_bits;
818
	/*
	 * We maintain three queues of entries.  The cache proper consists
	 * of a clean and a dirty queue, which hold the currently active
	 * mappings.  The hotspot queue uses a larger block size to track
	 * blocks that are being hit frequently and are potential
	 * candidates for promotion to the cache.
	 */
826	struct queue hotspot;
827	struct queue clean;
828	struct queue dirty;
829
830	struct stats hotspot_stats;
831	struct stats cache_stats;
832
833	/*
834	 * Keeps track of time, incremented by the core.  We use this to
835	 * avoid attributing multiple hits within the same tick.
836	 */
837	unsigned int tick;
838
	/*
	 * The hash tables allow us to quickly find an entry by origin
	 * block.
	 */
843	struct smq_hash_table table;
844	struct smq_hash_table hotspot_table;
845
846	bool current_writeback_sentinels;
847	unsigned long next_writeback_period;
848
849	bool current_demote_sentinels;
850	unsigned long next_demote_period;
851
852	unsigned int write_promote_level;
853	unsigned int read_promote_level;
854
855	unsigned long next_hotspot_period;
856	unsigned long next_cache_period;
857
858	struct background_tracker *bg_work;
859
860	bool migrations_allowed:1;
861
862	/*
863	 * If this is set the policy will try and clean the whole cache
864	 * even if the device is not idle.
865	 */
866	bool cleaner:1;
867};
868
869/*----------------------------------------------------------------*/
870
871static struct entry *get_sentinel(struct entry_alloc *ea, unsigned int level, bool which)
872{
873	return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
874}
875
876static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned int level)
877{
878	return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
879}
880
881static struct entry *demote_sentinel(struct smq_policy *mq, unsigned int level)
882{
883	return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
884}
885
886static void __update_writeback_sentinels(struct smq_policy *mq)
887{
888	unsigned int level;
889	struct queue *q = &mq->dirty;
890	struct entry *sentinel;
891
892	for (level = 0; level < q->nr_levels; level++) {
893		sentinel = writeback_sentinel(mq, level);
894		q_del(q, sentinel);
895		q_push(q, sentinel);
896	}
897}
898
899static void __update_demote_sentinels(struct smq_policy *mq)
900{
901	unsigned int level;
902	struct queue *q = &mq->clean;
903	struct entry *sentinel;
904
905	for (level = 0; level < q->nr_levels; level++) {
906		sentinel = demote_sentinel(mq, level);
907		q_del(q, sentinel);
908		q_push(q, sentinel);
909	}
910}
911
912static void update_sentinels(struct smq_policy *mq)
913{
914	if (time_after(jiffies, mq->next_writeback_period)) {
915		mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
916		mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
917		__update_writeback_sentinels(mq);
918	}
919
920	if (time_after(jiffies, mq->next_demote_period)) {
921		mq->next_demote_period = jiffies + DEMOTE_PERIOD;
922		mq->current_demote_sentinels = !mq->current_demote_sentinels;
923		__update_demote_sentinels(mq);
924	}
925}
926
927static void __sentinels_init(struct smq_policy *mq)
928{
929	unsigned int level;
930	struct entry *sentinel;
931
932	for (level = 0; level < NR_CACHE_LEVELS; level++) {
933		sentinel = writeback_sentinel(mq, level);
934		sentinel->level = level;
935		q_push(&mq->dirty, sentinel);
936
937		sentinel = demote_sentinel(mq, level);
938		sentinel->level = level;
939		q_push(&mq->clean, sentinel);
940	}
941}
942
943static void sentinels_init(struct smq_policy *mq)
944{
945	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
946	mq->next_demote_period = jiffies + DEMOTE_PERIOD;
947
948	mq->current_writeback_sentinels = false;
949	mq->current_demote_sentinels = false;
950	__sentinels_init(mq);
951
952	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
953	mq->current_demote_sentinels = !mq->current_demote_sentinels;
954	__sentinels_init(mq);
955}
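
/*
 * Illustrative sketch: each level of the clean and dirty queues holds
 * two sentinel entries which are re-queued alternately.  Every
 * WRITEBACK_PERIOD (or DEMOTE_PERIOD for the clean queue)
 * update_sentinels() flips current_*_sentinels and moves the newly
 * current sentinel to the tail of its level:
 *
 *	before flip:  Sa  e1  e2  Sb  e3
 *	after flip:   e1  e2  Sb  e3  Sa	(Sa re-queued at the tail)
 *
 * Entries found in front of a sentinel have therefore been queued since
 * before that sentinel was last moved, i.e. for at least one period.
 * queue_writeback() passes 'idle' as can_cross_sentinel to q_peek(), so
 * while the device is busy only sufficiently old dirty blocks are
 * picked for writeback.
 */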
956
957/*----------------------------------------------------------------*/
958
959static void del_queue(struct smq_policy *mq, struct entry *e)
960{
961	q_del(e->dirty ? &mq->dirty : &mq->clean, e);
962}
963
964static void push_queue(struct smq_policy *mq, struct entry *e)
965{
966	if (e->dirty)
967		q_push(&mq->dirty, e);
968	else
969		q_push(&mq->clean, e);
970}
971
// !h, !q, a -> h, q, a  (h: in the hash table, q: on a queue, a: allocated)
973static void push(struct smq_policy *mq, struct entry *e)
974{
975	h_insert(&mq->table, e);
976	if (!e->pending_work)
977		push_queue(mq, e);
978}
979
980static void push_queue_front(struct smq_policy *mq, struct entry *e)
981{
982	if (e->dirty)
983		q_push_front(&mq->dirty, e);
984	else
985		q_push_front(&mq->clean, e);
986}
987
988static void push_front(struct smq_policy *mq, struct entry *e)
989{
990	h_insert(&mq->table, e);
991	if (!e->pending_work)
992		push_queue_front(mq, e);
993}
994
995static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
996{
997	return to_cblock(get_index(&mq->cache_alloc, e));
998}
999
1000static void requeue(struct smq_policy *mq, struct entry *e)
1001{
1002	/*
1003	 * Pending work has temporarily been taken out of the queues.
1004	 */
1005	if (e->pending_work)
1006		return;
1007
1008	if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
1009		if (!e->dirty) {
1010			q_requeue(&mq->clean, e, 1u, NULL, NULL);
1011			return;
1012		}
1013
1014		q_requeue(&mq->dirty, e, 1u,
1015			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
1016			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
1017	}
1018}
1019
1020static unsigned int default_promote_level(struct smq_policy *mq)
1021{
1022	/*
1023	 * The promote level depends on the current performance of the
1024	 * cache.
1025	 *
1026	 * If the cache is performing badly, then we can't afford
1027	 * to promote much without causing performance to drop below that
1028	 * of the origin device.
1029	 *
1030	 * If the cache is performing well, then we don't need to promote
1031	 * much.  If it isn't broken, don't fix it.
1032	 *
1033	 * If the cache is middling then we promote more.
1034	 *
1035	 * This scheme reminds me of a graph of entropy vs probability of a
1036	 * binary variable.
1037	 */
1038	static const unsigned int table[] = {
1039		1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1
1040	};
1041
1042	unsigned int hits = mq->cache_stats.hits;
1043	unsigned int misses = mq->cache_stats.misses;
1044	unsigned int index = safe_div(hits << 4u, hits + misses);
1045	return table[index];
1046}
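
/*
 * Worked example (illustrative numbers): the table above is indexed by
 * a 4 bit fixed point hit ratio, so index ranges from 0 (all misses)
 * to 16 (all hits).  With 300 hits and 700 misses:
 *
 *	index = (300 << 4) / (300 + 700) = 4800 / 1000 = 4
 *	default_promote_level() == table[4] == 4
 *
 * Middling ratios (index 7 or 8) give the largest values, while very
 * low or very high ratios give smaller ones, matching the entropy-like
 * shape described above.
 */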
1047
1048static void update_promote_levels(struct smq_policy *mq)
1049{
1050	/*
1051	 * If there are unused cache entries then we want to be really
1052	 * eager to promote.
1053	 */
1054	unsigned int threshold_level = allocator_empty(&mq->cache_alloc) ?
1055		default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
1056
1057	threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
1058
1059	/*
1060	 * If the hotspot queue is performing badly then we have little
1061	 * confidence that we know which blocks to promote.  So we cut down
1062	 * the amount of promotions.
1063	 */
1064	switch (stats_assess(&mq->hotspot_stats)) {
1065	case Q_POOR:
1066		threshold_level /= 4u;
1067		break;
1068
1069	case Q_FAIR:
1070		threshold_level /= 2u;
1071		break;
1072
1073	case Q_WELL:
1074		break;
1075	}
1076
1077	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
1078	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
1079}
1080
1081/*
1082 * If the hotspot queue is performing badly, then we try and move entries
1083 * around more quickly.
1084 */
1085static void update_level_jump(struct smq_policy *mq)
1086{
1087	switch (stats_assess(&mq->hotspot_stats)) {
1088	case Q_POOR:
1089		mq->hotspot_level_jump = 4u;
1090		break;
1091
1092	case Q_FAIR:
1093		mq->hotspot_level_jump = 2u;
1094		break;
1095
1096	case Q_WELL:
1097		mq->hotspot_level_jump = 1u;
1098		break;
1099	}
1100}
1101
1102static void end_hotspot_period(struct smq_policy *mq)
1103{
1104	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
1105	update_promote_levels(mq);
1106
1107	if (time_after(jiffies, mq->next_hotspot_period)) {
1108		update_level_jump(mq);
1109		q_redistribute(&mq->hotspot);
1110		stats_reset(&mq->hotspot_stats);
1111		mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
1112	}
1113}
1114
1115static void end_cache_period(struct smq_policy *mq)
1116{
1117	if (time_after(jiffies, mq->next_cache_period)) {
1118		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
1119
1120		q_redistribute(&mq->dirty);
1121		q_redistribute(&mq->clean);
1122		stats_reset(&mq->cache_stats);
1123
1124		mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
1125	}
1126}
1127
1128/*----------------------------------------------------------------*/
1129
1130/*
1131 * Targets are given as a percentage.
1132 */
1133#define CLEAN_TARGET 25u
1134#define FREE_TARGET 25u
1135
1136static unsigned int percent_to_target(struct smq_policy *mq, unsigned int p)
1137{
1138	return from_cblock(mq->cache_size) * p / 100u;
1139}
1140
1141static bool clean_target_met(struct smq_policy *mq, bool idle)
1142{
1143	/*
1144	 * Cache entries may not be populated.  So we cannot rely on the
1145	 * size of the clean queue.
1146	 */
1147	if (idle || mq->cleaner) {
1148		/*
1149		 * We'd like to clean everything.
1150		 */
1151		return q_size(&mq->dirty) == 0u;
1152	}
1153
1154	/*
1155	 * If we're busy we don't worry about cleaning at all.
1156	 */
1157	return true;
1158}
1159
1160static bool free_target_met(struct smq_policy *mq)
1161{
1162	unsigned int nr_free;
1163
1164	nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
1165	return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
1166		percent_to_target(mq, FREE_TARGET);
1167}
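
/*
 * Worked example (illustrative numbers): with a cache of 10000 blocks
 * and FREE_TARGET = 25, percent_to_target() gives 10000 * 25 / 100 =
 * 2500.  free_target_met() is then satisfied once the number of
 * unallocated cblocks plus the demotions already queued with the
 * background tracker reaches 2500, at which point queue_promotion()
 * stops requesting further demotions.
 */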
1168
1169/*----------------------------------------------------------------*/
1170
1171static void mark_pending(struct smq_policy *mq, struct entry *e)
1172{
1173	BUG_ON(e->sentinel);
1174	BUG_ON(!e->allocated);
1175	BUG_ON(e->pending_work);
1176	e->pending_work = true;
1177}
1178
1179static void clear_pending(struct smq_policy *mq, struct entry *e)
1180{
1181	BUG_ON(!e->pending_work);
1182	e->pending_work = false;
1183}
1184
1185static void queue_writeback(struct smq_policy *mq, bool idle)
1186{
1187	int r;
1188	struct policy_work work;
1189	struct entry *e;
1190
1191	e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle);
1192	if (e) {
1193		mark_pending(mq, e);
1194		q_del(&mq->dirty, e);
1195
1196		work.op = POLICY_WRITEBACK;
1197		work.oblock = e->oblock;
1198		work.cblock = infer_cblock(mq, e);
1199
1200		r = btracker_queue(mq->bg_work, &work, NULL);
1201		if (r) {
1202			clear_pending(mq, e);
1203			q_push_front(&mq->dirty, e);
1204		}
1205	}
1206}
1207
1208static void queue_demotion(struct smq_policy *mq)
1209{
1210	int r;
1211	struct policy_work work;
1212	struct entry *e;
1213
1214	if (WARN_ON_ONCE(!mq->migrations_allowed))
1215		return;
1216
1217	e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
1218	if (!e) {
1219		if (!clean_target_met(mq, true))
1220			queue_writeback(mq, false);
1221		return;
1222	}
1223
1224	mark_pending(mq, e);
1225	q_del(&mq->clean, e);
1226
1227	work.op = POLICY_DEMOTE;
1228	work.oblock = e->oblock;
1229	work.cblock = infer_cblock(mq, e);
1230	r = btracker_queue(mq->bg_work, &work, NULL);
1231	if (r) {
1232		clear_pending(mq, e);
1233		q_push_front(&mq->clean, e);
1234	}
1235}
1236
1237static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
1238			    struct policy_work **workp)
1239{
1240	int r;
1241	struct entry *e;
1242	struct policy_work work;
1243
1244	if (!mq->migrations_allowed)
1245		return;
1246
1247	if (allocator_empty(&mq->cache_alloc)) {
1248		/*
1249		 * We always claim to be 'idle' to ensure some demotions happen
1250		 * with continuous loads.
1251		 */
1252		if (!free_target_met(mq))
1253			queue_demotion(mq);
1254		return;
1255	}
1256
1257	if (btracker_promotion_already_present(mq->bg_work, oblock))
1258		return;
1259
1260	/*
1261	 * We allocate the entry now to reserve the cblock.  If the
1262	 * background work is aborted we must remember to free it.
1263	 */
1264	e = alloc_entry(&mq->cache_alloc);
1265	BUG_ON(!e);
1266	e->pending_work = true;
1267	work.op = POLICY_PROMOTE;
1268	work.oblock = oblock;
1269	work.cblock = infer_cblock(mq, e);
1270	r = btracker_queue(mq->bg_work, &work, workp);
1271	if (r)
1272		free_entry(&mq->cache_alloc, e);
1273}
1274
1275/*----------------------------------------------------------------*/
1276
1277enum promote_result {
1278	PROMOTE_NOT,
1279	PROMOTE_TEMPORARY,
1280	PROMOTE_PERMANENT
1281};
1282
1283/*
1284 * Converts a boolean into a promote result.
1285 */
1286static enum promote_result maybe_promote(bool promote)
1287{
1288	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
1289}
1290
1291static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
1292					  int data_dir, bool fast_promote)
1293{
1294	if (data_dir == WRITE) {
1295		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
1296			return PROMOTE_TEMPORARY;
1297
1298		return maybe_promote(hs_e->level >= mq->write_promote_level);
1299	} else
1300		return maybe_promote(hs_e->level >= mq->read_promote_level);
1301}
1302
1303static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
1304{
1305	sector_t r = from_oblock(b);
1306	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
1307	return to_oblock(r);
1308}
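
/*
 * Worked example (illustrative numbers): hotspot blocks are groups of
 * adjacent origin blocks.  With cache_blocks_per_hotspot_block = 16,
 * origin blocks 0..15 map to hotspot block 0, 16..31 to hotspot block
 * 1, and so on:
 *
 *	to_hblock(mq, 37) == 37 / 16 == 2
 *
 * so hits on neighbouring origin blocks accumulate in one coarser
 * grained hotspot entry.
 */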
1309
1310static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
1311{
1312	unsigned int hi;
1313	dm_oblock_t hb = to_hblock(mq, b);
1314	struct entry *e = h_lookup(&mq->hotspot_table, hb);
1315
1316	if (e) {
1317		stats_level_accessed(&mq->hotspot_stats, e->level);
1318
1319		hi = get_index(&mq->hotspot_alloc, e);
1320		q_requeue(&mq->hotspot, e,
1321			  test_and_set_bit(hi, mq->hotspot_hit_bits) ?
1322			  0u : mq->hotspot_level_jump,
1323			  NULL, NULL);
1324
1325	} else {
1326		stats_miss(&mq->hotspot_stats);
1327
1328		e = alloc_entry(&mq->hotspot_alloc);
1329		if (!e) {
1330			e = q_pop(&mq->hotspot);
1331			if (e) {
1332				h_remove(&mq->hotspot_table, e);
1333				hi = get_index(&mq->hotspot_alloc, e);
1334				clear_bit(hi, mq->hotspot_hit_bits);
1335			}
1336
1337		}
1338
1339		if (e) {
1340			e->oblock = hb;
1341			q_push(&mq->hotspot, e);
1342			h_insert(&mq->hotspot_table, e);
1343		}
1344	}
1345
1346	return e;
1347}
1348
1349/*----------------------------------------------------------------*/
1350
1351/*
1352 * Public interface, via the policy struct.  See dm-cache-policy.h for a
1353 * description of these.
1354 */
1355
1356static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
1357{
1358	return container_of(p, struct smq_policy, policy);
1359}
1360
1361static void smq_destroy(struct dm_cache_policy *p)
1362{
1363	struct smq_policy *mq = to_smq_policy(p);
1364
1365	btracker_destroy(mq->bg_work);
1366	h_exit(&mq->hotspot_table);
1367	h_exit(&mq->table);
1368	free_bitset(mq->hotspot_hit_bits);
1369	free_bitset(mq->cache_hit_bits);
1370	space_exit(&mq->es);
1371	kfree(mq);
1372}
1373
1374/*----------------------------------------------------------------*/
1375
1376static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
1377		    int data_dir, bool fast_copy,
1378		    struct policy_work **work, bool *background_work)
1379{
1380	struct entry *e, *hs_e;
1381	enum promote_result pr;
1382
1383	*background_work = false;
1384
1385	e = h_lookup(&mq->table, oblock);
1386	if (e) {
1387		stats_level_accessed(&mq->cache_stats, e->level);
1388
1389		requeue(mq, e);
1390		*cblock = infer_cblock(mq, e);
1391		return 0;
1392
1393	} else {
1394		stats_miss(&mq->cache_stats);
1395
1396		/*
1397		 * The hotspot queue only gets updated with misses.
1398		 */
1399		hs_e = update_hotspot_queue(mq, oblock);
1400
1401		pr = should_promote(mq, hs_e, data_dir, fast_copy);
1402		if (pr != PROMOTE_NOT) {
1403			queue_promotion(mq, oblock, work);
1404			*background_work = true;
1405		}
1406
1407		return -ENOENT;
1408	}
1409}
1410
1411static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
1412		      int data_dir, bool fast_copy,
1413		      bool *background_work)
1414{
1415	int r;
1416	unsigned long flags;
1417	struct smq_policy *mq = to_smq_policy(p);
1418
1419	spin_lock_irqsave(&mq->lock, flags);
1420	r = __lookup(mq, oblock, cblock,
1421		     data_dir, fast_copy,
1422		     NULL, background_work);
1423	spin_unlock_irqrestore(&mq->lock, flags);
1424
1425	return r;
1426}
1427
1428static int smq_lookup_with_work(struct dm_cache_policy *p,
1429				dm_oblock_t oblock, dm_cblock_t *cblock,
1430				int data_dir, bool fast_copy,
1431				struct policy_work **work)
1432{
1433	int r;
1434	bool background_queued;
1435	unsigned long flags;
1436	struct smq_policy *mq = to_smq_policy(p);
1437
1438	spin_lock_irqsave(&mq->lock, flags);
1439	r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
1440	spin_unlock_irqrestore(&mq->lock, flags);
1441
1442	return r;
1443}
1444
1445static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
1446				   struct policy_work **result)
1447{
1448	int r;
1449	unsigned long flags;
1450	struct smq_policy *mq = to_smq_policy(p);
1451
1452	spin_lock_irqsave(&mq->lock, flags);
1453	r = btracker_issue(mq->bg_work, result);
1454	if (r == -ENODATA) {
1455		if (!clean_target_met(mq, idle)) {
1456			queue_writeback(mq, idle);
1457			r = btracker_issue(mq->bg_work, result);
1458		}
1459	}
1460	spin_unlock_irqrestore(&mq->lock, flags);
1461
1462	return r;
1463}
1464
1465/*
1466 * We need to clear any pending work flags that have been set, and in the
1467 * case of promotion free the entry for the destination cblock.
1468 */
1469static void __complete_background_work(struct smq_policy *mq,
1470				       struct policy_work *work,
1471				       bool success)
1472{
1473	struct entry *e = get_entry(&mq->cache_alloc,
1474				    from_cblock(work->cblock));
1475
1476	switch (work->op) {
1477	case POLICY_PROMOTE:
1478		// !h, !q, a
1479		clear_pending(mq, e);
1480		if (success) {
1481			e->oblock = work->oblock;
1482			e->level = NR_CACHE_LEVELS - 1;
1483			push(mq, e);
1484			// h, q, a
1485		} else {
1486			free_entry(&mq->cache_alloc, e);
1487			// !h, !q, !a
1488		}
1489		break;
1490
1491	case POLICY_DEMOTE:
1492		// h, !q, a
1493		if (success) {
1494			h_remove(&mq->table, e);
1495			free_entry(&mq->cache_alloc, e);
1496			// !h, !q, !a
1497		} else {
1498			clear_pending(mq, e);
1499			push_queue(mq, e);
1500			// h, q, a
1501		}
1502		break;
1503
1504	case POLICY_WRITEBACK:
1505		// h, !q, a
1506		clear_pending(mq, e);
1507		push_queue(mq, e);
1508		// h, q, a
1509		break;
1510	}
1511
1512	btracker_complete(mq->bg_work, work);
1513}
1514
1515static void smq_complete_background_work(struct dm_cache_policy *p,
1516					 struct policy_work *work,
1517					 bool success)
1518{
1519	unsigned long flags;
1520	struct smq_policy *mq = to_smq_policy(p);
1521
1522	spin_lock_irqsave(&mq->lock, flags);
1523	__complete_background_work(mq, work, success);
1524	spin_unlock_irqrestore(&mq->lock, flags);
1525}
1526
1527// in_hash(oblock) -> in_hash(oblock)
1528static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
1529{
1530	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1531
1532	if (e->pending_work)
1533		e->dirty = set;
1534	else {
1535		del_queue(mq, e);
1536		e->dirty = set;
1537		push_queue(mq, e);
1538	}
1539}
1540
1541static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1542{
1543	unsigned long flags;
1544	struct smq_policy *mq = to_smq_policy(p);
1545
1546	spin_lock_irqsave(&mq->lock, flags);
1547	__smq_set_clear_dirty(mq, cblock, true);
1548	spin_unlock_irqrestore(&mq->lock, flags);
1549}
1550
1551static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1552{
1553	struct smq_policy *mq = to_smq_policy(p);
1554	unsigned long flags;
1555
1556	spin_lock_irqsave(&mq->lock, flags);
1557	__smq_set_clear_dirty(mq, cblock, false);
1558	spin_unlock_irqrestore(&mq->lock, flags);
1559}
1560
1561static unsigned int random_level(dm_cblock_t cblock)
1562{
1563	return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
1564}
1565
1566static int smq_load_mapping(struct dm_cache_policy *p,
1567			    dm_oblock_t oblock, dm_cblock_t cblock,
1568			    bool dirty, uint32_t hint, bool hint_valid)
1569{
1570	struct smq_policy *mq = to_smq_policy(p);
1571	struct entry *e;
1572
1573	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
1574	e->oblock = oblock;
1575	e->dirty = dirty;
1576	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
1577	e->pending_work = false;
1578
1579	/*
1580	 * When we load mappings we push ahead of both sentinels in order to
1581	 * allow demotions and cleaning to occur immediately.
1582	 */
1583	push_front(mq, e);
1584
1585	return 0;
1586}
1587
1588static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
1589{
1590	struct smq_policy *mq = to_smq_policy(p);
1591	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1592
1593	if (!e->allocated)
1594		return -ENODATA;
1595
1596	// FIXME: what if this block has pending background work?
1597	del_queue(mq, e);
1598	h_remove(&mq->table, e);
1599	free_entry(&mq->cache_alloc, e);
1600	return 0;
1601}
1602
1603static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
1604{
1605	struct smq_policy *mq = to_smq_policy(p);
1606	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1607
1608	if (!e->allocated)
1609		return 0;
1610
1611	return e->level;
1612}
1613
1614static dm_cblock_t smq_residency(struct dm_cache_policy *p)
1615{
1616	dm_cblock_t r;
1617	unsigned long flags;
1618	struct smq_policy *mq = to_smq_policy(p);
1619
1620	spin_lock_irqsave(&mq->lock, flags);
1621	r = to_cblock(mq->cache_alloc.nr_allocated);
1622	spin_unlock_irqrestore(&mq->lock, flags);
1623
1624	return r;
1625}
1626
1627static void smq_tick(struct dm_cache_policy *p, bool can_block)
1628{
1629	struct smq_policy *mq = to_smq_policy(p);
1630	unsigned long flags;
1631
1632	spin_lock_irqsave(&mq->lock, flags);
1633	mq->tick++;
1634	update_sentinels(mq);
1635	end_hotspot_period(mq);
1636	end_cache_period(mq);
1637	spin_unlock_irqrestore(&mq->lock, flags);
1638}
1639
1640static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
1641{
1642	struct smq_policy *mq = to_smq_policy(p);
1643
1644	mq->migrations_allowed = allow;
1645}
1646
1647/*
1648 * smq has no config values, but the old mq policy did.  To avoid breaking
1649 * software we continue to accept these configurables for the mq policy,
1650 * but they have no effect.
1651 */
1652static int mq_set_config_value(struct dm_cache_policy *p,
1653			       const char *key, const char *value)
1654{
1655	unsigned long tmp;
1656
1657	if (kstrtoul(value, 10, &tmp))
1658		return -EINVAL;
1659
1660	if (!strcasecmp(key, "random_threshold") ||
1661	    !strcasecmp(key, "sequential_threshold") ||
1662	    !strcasecmp(key, "discard_promote_adjustment") ||
1663	    !strcasecmp(key, "read_promote_adjustment") ||
1664	    !strcasecmp(key, "write_promote_adjustment")) {
1665		DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
1666		return 0;
1667	}
1668
1669	return -EINVAL;
1670}
1671
1672static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
1673				 unsigned int maxlen, ssize_t *sz_ptr)
1674{
1675	ssize_t sz = *sz_ptr;
1676
1677	DMEMIT("10 random_threshold 0 "
1678	       "sequential_threshold 0 "
1679	       "discard_promote_adjustment 0 "
1680	       "read_promote_adjustment 0 "
1681	       "write_promote_adjustment 0 ");
1682
1683	*sz_ptr = sz;
1684	return 0;
1685}
1686
1687/* Init the policy plugin interface function pointers. */
1688static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
1689{
1690	mq->policy.destroy = smq_destroy;
1691	mq->policy.lookup = smq_lookup;
1692	mq->policy.lookup_with_work = smq_lookup_with_work;
1693	mq->policy.get_background_work = smq_get_background_work;
1694	mq->policy.complete_background_work = smq_complete_background_work;
1695	mq->policy.set_dirty = smq_set_dirty;
1696	mq->policy.clear_dirty = smq_clear_dirty;
1697	mq->policy.load_mapping = smq_load_mapping;
1698	mq->policy.invalidate_mapping = smq_invalidate_mapping;
1699	mq->policy.get_hint = smq_get_hint;
1700	mq->policy.residency = smq_residency;
1701	mq->policy.tick = smq_tick;
1702	mq->policy.allow_migrations = smq_allow_migrations;
1703
1704	if (mimic_mq) {
1705		mq->policy.set_config_value = mq_set_config_value;
1706		mq->policy.emit_config_values = mq_emit_config_values;
1707	}
1708}
1709
1710static bool too_many_hotspot_blocks(sector_t origin_size,
1711				    sector_t hotspot_block_size,
1712				    unsigned int nr_hotspot_blocks)
1713{
1714	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
1715}
1716
1717static void calc_hotspot_params(sector_t origin_size,
1718				sector_t cache_block_size,
1719				unsigned int nr_cache_blocks,
1720				sector_t *hotspot_block_size,
1721				unsigned int *nr_hotspot_blocks)
1722{
1723	*hotspot_block_size = cache_block_size * 16u;
1724	*nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
1725
1726	while ((*hotspot_block_size > cache_block_size) &&
1727	       too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
1728		*hotspot_block_size /= 2u;
1729}
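
/*
 * Worked example (illustrative numbers): with a cache block size of 64
 * sectors (32KiB) and 8192 cache blocks, the initial guess is
 *
 *	hotspot_block_size = 64 * 16 = 1024 sectors
 *	nr_hotspot_blocks  = max(8192 / 4, 1024) = 2048
 *
 * If the origin device is only 1048576 sectors (512MiB), then
 * 1024 * 2048 sectors would more than cover it, so the hotspot block
 * size is halved once to 512 sectors, at which point the hotspot
 * blocks exactly cover the origin and the loop stops.
 */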
1730
1731static struct dm_cache_policy *
1732__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size,
1733	     bool mimic_mq, bool migrations_allowed, bool cleaner)
1734{
1735	unsigned int i;
1736	unsigned int nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
1737	unsigned int total_sentinels = 2u * nr_sentinels_per_queue;
1738	struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
1739
1740	if (!mq)
1741		return NULL;
1742
1743	init_policy_functions(mq, mimic_mq);
1744	mq->cache_size = cache_size;
1745	mq->cache_block_size = cache_block_size;
1746
1747	calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
1748			    &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
1749
1750	mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
1751	mq->hotspot_level_jump = 1u;
1752	if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
1753		DMERR("couldn't initialize entry space");
1754		goto bad_pool_init;
1755	}
1756
1757	init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
1758	for (i = 0; i < nr_sentinels_per_queue; i++)
1759		get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
1760
1761	init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
1762	for (i = 0; i < nr_sentinels_per_queue; i++)
1763		get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
1764
1765	init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
1766		       total_sentinels + mq->nr_hotspot_blocks);
1767
1768	init_allocator(&mq->cache_alloc, &mq->es,
1769		       total_sentinels + mq->nr_hotspot_blocks,
1770		       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
1771
1772	mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
1773	if (!mq->hotspot_hit_bits) {
1774		DMERR("couldn't allocate hotspot hit bitset");
1775		goto bad_hotspot_hit_bits;
1776	}
1777	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
1778
1779	if (from_cblock(cache_size)) {
1780		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
1781		if (!mq->cache_hit_bits) {
1782			DMERR("couldn't allocate cache hit bitset");
1783			goto bad_cache_hit_bits;
1784		}
1785		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
1786	} else
1787		mq->cache_hit_bits = NULL;
1788
1789	mq->tick = 0;
1790	spin_lock_init(&mq->lock);
1791
1792	q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
1793	mq->hotspot.nr_top_levels = 8;
1794	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
1795					   from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
1796
1797	q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
1798	q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
1799
1800	stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
1801	stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
1802
1803	if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
1804		goto bad_alloc_table;
1805
1806	if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
1807		goto bad_alloc_hotspot_table;
1808
1809	sentinels_init(mq);
1810	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
1811
1812	mq->next_hotspot_period = jiffies;
1813	mq->next_cache_period = jiffies;
1814
1815	mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */
1816	if (!mq->bg_work)
1817		goto bad_btracker;
1818
1819	mq->migrations_allowed = migrations_allowed;
1820	mq->cleaner = cleaner;
1821
1822	return &mq->policy;
1823
1824bad_btracker:
1825	h_exit(&mq->hotspot_table);
1826bad_alloc_hotspot_table:
1827	h_exit(&mq->table);
1828bad_alloc_table:
1829	free_bitset(mq->cache_hit_bits);
1830bad_cache_hit_bits:
1831	free_bitset(mq->hotspot_hit_bits);
1832bad_hotspot_hit_bits:
1833	space_exit(&mq->es);
1834bad_pool_init:
1835	kfree(mq);
1836
1837	return NULL;
1838}
1839
1840static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
1841					  sector_t origin_size,
1842					  sector_t cache_block_size)
1843{
1844	return __smq_create(cache_size, origin_size, cache_block_size,
1845			    false, true, false);
1846}
1847
1848static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1849					 sector_t origin_size,
1850					 sector_t cache_block_size)
1851{
1852	return __smq_create(cache_size, origin_size, cache_block_size,
1853			    true, true, false);
1854}
1855
1856static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
1857					      sector_t origin_size,
1858					      sector_t cache_block_size)
1859{
1860	return __smq_create(cache_size, origin_size, cache_block_size,
1861			    false, false, true);
1862}
1863
1864/*----------------------------------------------------------------*/
1865
1866static struct dm_cache_policy_type smq_policy_type = {
1867	.name = "smq",
1868	.version = {2, 0, 0},
1869	.hint_size = 4,
1870	.owner = THIS_MODULE,
1871	.create = smq_create
1872};
1873
1874static struct dm_cache_policy_type mq_policy_type = {
1875	.name = "mq",
1876	.version = {2, 0, 0},
1877	.hint_size = 4,
1878	.owner = THIS_MODULE,
1879	.create = mq_create,
1880};
1881
1882static struct dm_cache_policy_type cleaner_policy_type = {
1883	.name = "cleaner",
1884	.version = {2, 0, 0},
1885	.hint_size = 4,
1886	.owner = THIS_MODULE,
1887	.create = cleaner_create,
1888};
1889
1890static struct dm_cache_policy_type default_policy_type = {
1891	.name = "default",
1892	.version = {2, 0, 0},
1893	.hint_size = 4,
1894	.owner = THIS_MODULE,
1895	.create = smq_create,
1896	.real = &smq_policy_type
1897};
1898
1899static int __init smq_init(void)
1900{
1901	int r;
1902
1903	r = dm_cache_policy_register(&smq_policy_type);
1904	if (r) {
1905		DMERR("register failed %d", r);
1906		return -ENOMEM;
1907	}
1908
1909	r = dm_cache_policy_register(&mq_policy_type);
1910	if (r) {
1911		DMERR("register failed (as mq) %d", r);
1912		goto out_mq;
1913	}
1914
1915	r = dm_cache_policy_register(&cleaner_policy_type);
1916	if (r) {
1917		DMERR("register failed (as cleaner) %d", r);
1918		goto out_cleaner;
1919	}
1920
1921	r = dm_cache_policy_register(&default_policy_type);
1922	if (r) {
1923		DMERR("register failed (as default) %d", r);
1924		goto out_default;
1925	}
1926
1927	return 0;
1928
1929out_default:
1930	dm_cache_policy_unregister(&cleaner_policy_type);
1931out_cleaner:
1932	dm_cache_policy_unregister(&mq_policy_type);
1933out_mq:
1934	dm_cache_policy_unregister(&smq_policy_type);
1935
1936	return -ENOMEM;
1937}
1938
1939static void __exit smq_exit(void)
1940{
1941	dm_cache_policy_unregister(&cleaner_policy_type);
1942	dm_cache_policy_unregister(&smq_policy_type);
1943	dm_cache_policy_unregister(&mq_policy_type);
1944	dm_cache_policy_unregister(&default_policy_type);
1945}
1946
1947module_init(smq_init);
1948module_exit(smq_exit);
1949
1950MODULE_AUTHOR("Joe Thornber <dm-devel@lists.linux.dev>");
1951MODULE_LICENSE("GPL");
1952MODULE_DESCRIPTION("smq cache policy");
1953
1954MODULE_ALIAS("dm-cache-default");
1955MODULE_ALIAS("dm-cache-mq");
1956MODULE_ALIAS("dm-cache-cleaner");
1957