1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright 2023 Red Hat
4 */
5
6#ifndef VDO_SLAB_DEPOT_H
7#define VDO_SLAB_DEPOT_H
8
9#include <linux/atomic.h>
10#include <linux/dm-kcopyd.h>
11#include <linux/list.h>
12
13#include "numeric.h"
14
15#include "admin-state.h"
16#include "completion.h"
17#include "data-vio.h"
18#include "encodings.h"
19#include "physical-zone.h"
20#include "priority-table.h"
21#include "recovery-journal.h"
22#include "statistics.h"
23#include "types.h"
24#include "vio.h"
25#include "wait-queue.h"
26
27/*
28 * A slab_depot is responsible for managing all of the slabs and block allocators of a VDO. It has
29 * a single array of slabs in order to eliminate the need for additional math in order to compute
30 * which physical zone a PBN is in. It also has a block_allocator per zone.
31 *
32 * Each physical zone has a single dedicated queue and thread for performing all updates to the
33 * slabs assigned to that zone. The concurrency guarantees of this single-threaded model allow the
34 * code to omit more fine-grained locking for the various slab structures. Each physical zone
35 * maintains a separate copy of the slab summary to remove the need for explicit locking on that
36 * structure as well.
37 *
38 * Load operations must be performed on the admin thread. Normal operations, such as allocations
39 * and reference count updates, must be performed on the appropriate physical zone thread. Requests
40 * from the recovery journal to commit slab journal tail blocks must be scheduled from the recovery
41 * journal thread to run on the appropriate physical zone thread. Save operations must be launched
42 * from the same admin thread as the original load operation.
43 */
44
enum {
	/*
	 * The number of vios in the vio pool is proportional to the throughput of the VDO.
	 * NOTE(review): presumably this sizes each block_allocator's vio_pool; confirm at the
	 * site where the pool is created.
	 */
	BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,
};
49
/*
 * The possible reference states of a single block.
 */
enum reference_status {
	/* The block is free. */
	RS_FREE = 0,
	/* The block is singly-referenced. */
	RS_SINGLE = 1,
	/* The block is shared. */
	RS_SHARED = 2,
	/* The block is provisionally allocated. */
	RS_PROVISIONAL = 3,
};
59
/* Forward declaration; struct slab_journal and struct reference_block point back at their slab. */
struct vdo_slab;

/*
 * A lock record for one block of a slab journal. struct slab_journal keeps an array of these
 * ('locks', one per on-disk block) and a pointer to the one for its oldest unreaped block
 * ('reap_lock').
 */
struct journal_lock {
	/* NOTE(review): presumably the number of outstanding holders of this lock -- confirm */
	u16 count;
	/*
	 * NOTE(review): presumably the recovery journal sequence number associated with this
	 * slab journal block -- confirm against the implementation.
	 */
	sequence_number_t recovery_start;
};
66
/*
 * The journal of a single slab. Each struct vdo_slab embeds one of these as its 'journal'
 * member, and the journal points back at that slab through 'slab'.
 */
struct slab_journal {
	/* A waiter object for getting a VIO pool entry */
	struct vdo_waiter resource_waiter;
	/* A waiter object for updating the slab summary */
	struct vdo_waiter slab_summary_waiter;
	/* A waiter object for getting a vio with which to flush */
	struct vdo_waiter flush_waiter;
	/* The queue of VIOs waiting to make an entry */
	struct vdo_wait_queue entry_waiters;
	/* The parent slab reference of this journal */
	struct vdo_slab *slab;

	/* Whether a tail block commit is pending */
	bool waiting_to_commit;
	/* Whether the journal is updating the slab summary */
	bool updating_slab_summary;
	/* Whether the journal is adding entries from the entry_waiters queue */
	bool adding_entries;
	/* Whether a partial write is in progress */
	bool partial_write_in_progress;

	/* The oldest block in the journal on disk */
	sequence_number_t head;
	/* The oldest block in the journal which may not be reaped */
	sequence_number_t unreapable;
	/* The end of the half-open interval of the active journal */
	sequence_number_t tail;
	/* The next journal block to be committed */
	sequence_number_t next_commit;
	/* The tail sequence number that is written in the slab summary */
	sequence_number_t summarized;
	/*
	 * The tail sequence number that was last summarized in slab summary.
	 * NOTE(review): this reads almost identically to 'summarized' above; confirm the exact
	 * distinction against the slab summary update code.
	 */
	sequence_number_t last_summarized;

	/* The sequence number of the recovery journal lock */
	sequence_number_t recovery_lock;

	/*
	 * The number of entries which fit in a single block. Can't use the constant because unit
	 * tests change this number.
	 */
	journal_entry_count_t entries_per_block;
	/*
	 * The number of full entries which fit in a single block. Can't use the constant because
	 * unit tests change this number.
	 */
	journal_entry_count_t full_entries_per_block;

	/* The recovery journal of the VDO (slab journal holds locks on it) */
	struct recovery_journal *recovery_journal;

	/* The statistics shared by all slab journals in our physical zone */
	struct slab_journal_statistics *events;
	/* A list of the VIO pool entries for outstanding journal block writes */
	struct list_head uncommitted_blocks;

	/*
	 * The current tail block header state. This will be packed into the block just before it
	 * is written.
	 */
	struct slab_journal_block_header tail_header;
	/* A pointer to a block-sized buffer holding the packed block data */
	struct packed_slab_journal_block *block;

	/* The number of blocks in the on-disk journal */
	block_count_t size;
	/* The number of blocks at which to start pushing reference blocks */
	block_count_t flushing_threshold;
	/* The number of blocks at which all reference blocks should be writing */
	block_count_t flushing_deadline;
	/* The number of blocks at which to wait for reference blocks to write */
	block_count_t blocking_threshold;
	/* The number of blocks at which to scrub the slab before coming online */
	block_count_t scrubbing_threshold;

	/* This list entry is for block_allocator to keep a queue of dirty journals */
	struct list_head dirty_entry;

	/* The lock for the oldest unreaped block of the journal */
	struct journal_lock *reap_lock;
	/* The locks for each on disk block */
	struct journal_lock *locks;
};
150
/*
 * The in-memory state of one block of a slab's reference counts. A struct vdo_slab holds an
 * array of these ('reference_blocks', with 'reference_block_count' entries).
 *
 * Blocks are used as a proxy, permitting saves of partial refcounts.
 */
struct reference_block {
	/* This block waits on the ref_counts to tell it to write */
	struct vdo_waiter waiter;
	/* The slab to which this reference_block belongs */
	struct vdo_slab *slab;
	/* The number of references in this block that represent allocations */
	block_size_t allocated_count;
	/* The slab journal block on which this block must hold a lock */
	sequence_number_t slab_journal_lock;
	/* The slab journal block which should be released when this block is committed */
	sequence_number_t slab_journal_lock_to_release;
	/* The point up to which each sector is accurate on disk (one entry per sector) */
	struct journal_point commit_points[VDO_SECTORS_PER_BLOCK];
	/* Whether this block has been modified since it was written to disk */
	bool is_dirty;
	/* Whether this block is currently writing */
	bool is_writing;
};
174
/*
 * The search_cursor represents the saved position of a free block search. Each struct vdo_slab
 * embeds one as its 'search_cursor' member.
 */
struct search_cursor {
	/* The reference block containing the current search index */
	struct reference_block *block;
	/* The position at which to start searching for the next free counter */
	slab_block_number index;
	/* The position just past the last valid counter in the current block */
	slab_block_number end_index;

	/* A pointer to the first reference block in the slab */
	struct reference_block *first_block;
	/* A pointer to the last reference block in the slab */
	struct reference_block *last_block;
};
189
/* The rebuild status of a slab, as recorded in the 'status' member of struct vdo_slab. */
enum slab_rebuild_status {
	VDO_SLAB_REBUILT = 0,
	VDO_SLAB_REPLAYING = 1,
	VDO_SLAB_REQUIRES_SCRUBBING = 2,
	VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING = 3,
	VDO_SLAB_REBUILDING = 4,
};
197
/*
 * This is the type declaration for the vdo_slab type. A vdo_slab currently consists of a run of
 * 2^23 data blocks, but that will soon change to dedicate a small number of those blocks for
 * metadata storage for the reference counts and slab journal for the slab.
 *
 * NOTE(review): the sentence above may be stale; the struct already carries 'journal_origin' and
 * 'ref_counts_origin' for the slab journal and reference counts. Confirm and update.
 *
 * A reference count is maintained for each physical block number. The vast majority of blocks have
 * a very small reference count (usually 0 or 1). For references less than or equal to MAXIMUM_REFS
 * (254) the reference count is stored in counters[pbn].
 */
struct vdo_slab {
	/* A list entry to queue this slab in a block_allocator list */
	struct list_head allocq_entry;

	/* The struct block_allocator that owns this slab */
	struct block_allocator *allocator;

	/* The journal for this slab */
	struct slab_journal journal;

	/* The slab number of this slab */
	slab_count_t slab_number;
	/* The offset in the allocator partition of the first block in this slab */
	physical_block_number_t start;
	/* The offset of the first block past the end of this slab */
	physical_block_number_t end;
	/* The starting translated PBN of the slab journal */
	physical_block_number_t journal_origin;
	/* The starting translated PBN of the reference counts */
	physical_block_number_t ref_counts_origin;

	/* The administrative state of the slab */
	struct admin_state state;
	/* The status of the slab */
	enum slab_rebuild_status status;
	/* Whether the slab was ever queued for scrubbing */
	bool was_queued_for_scrubbing;

	/* The priority at which this slab has been queued for allocation */
	u8 priority;

	/* Fields beyond this point are the reference counts for the data blocks in this slab. */
	/* The size of the counters array */
	u32 block_count;
	/* The number of free blocks */
	u32 free_blocks;
	/* The array of reference counts */
	vdo_refcount_t *counters; /* use vdo_allocate() to align data ptr */

	/* The saved block pointer and array indexes for the free block search */
	struct search_cursor search_cursor;

	/* A list of the dirty blocks waiting to be written out */
	struct vdo_wait_queue dirty_blocks;
	/* The number of blocks which are currently writing */
	size_t active_count;

	/* A waiter object for updating the slab summary */
	struct vdo_waiter summary_waiter;

	/* The latest slab journal for which there has been a reference count update */
	struct journal_point slab_journal_point;

	/* The number of reference count blocks */
	u32 reference_block_count;
	/* The array of reference count blocks (reference_block_count entries) */
	struct reference_block *reference_blocks;
};
265
/* The steps of draining a block allocator, as tracked by block_allocator.drain_step. */
enum block_allocator_drain_step {
	VDO_DRAIN_ALLOCATOR_START = 0,
	VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER = 1,
	VDO_DRAIN_ALLOCATOR_STEP_SLABS = 2,
	VDO_DRAIN_ALLOCATOR_STEP_SUMMARY = 3,
	VDO_DRAIN_ALLOCATOR_STEP_FINISHED = 4,
};
273
/*
 * The state used to scrub slabs. Each struct block_allocator embeds one as its 'scrubber'
 * member.
 */
struct slab_scrubber {
	/* The queue of slabs to scrub first */
	struct list_head high_priority_slabs;
	/* The queue of slabs to scrub once there are no high_priority_slabs */
	struct list_head slabs;
	/* The queue of VIOs waiting for a slab to be scrubbed */
	struct vdo_wait_queue waiters;

	/*
	 * The number of slabs that are unrecovered or being scrubbed. This field is modified by
	 * the physical zone thread, but is queried by other threads.
	 */
	slab_count_t slab_count;

	/* The administrative state of the scrubber */
	struct admin_state admin_state;
	/* Whether to only scrub high-priority slabs */
	bool high_priority_only;
	/* The slab currently being scrubbed */
	struct vdo_slab *slab;
	/* The vio for loading slab journal blocks */
	struct vio vio;
};
297
/*
 * A sub-structure for applying actions in parallel to all of an allocator's slabs. Each struct
 * block_allocator embeds one as its 'slab_actor' member.
 */
struct slab_actor {
	/* The number of slabs performing a slab action */
	slab_count_t slab_action_count;
	/* The method to call when a slab action has been completed by all slabs */
	vdo_action_fn callback;
};
305
/*
 * A slab_iterator is a structure for iterating over a set of slabs. The block_allocator uses
 * one ('slabs_to_erase') to walk the slabs whose journals are to be erased.
 */
struct slab_iterator {
	/* The array of slab pointers being iterated over */
	struct vdo_slab **slabs;
	/* NOTE(review): presumably the next slab the iterator will yield -- confirm */
	struct vdo_slab *next;
	/* NOTE(review): presumably the slab number at which iteration stops -- confirm */
	slab_count_t end;
	/* NOTE(review): presumably the slab-number distance between successive slabs -- confirm */
	slab_count_t stride;
};
313
314/*
315 * The slab_summary provides hints during load and recovery about the state of the slabs in order
316 * to avoid the need to read the slab journals in their entirety before a VDO can come online.
317 *
318 * The information in the summary for each slab includes the rough number of free blocks (which is
319 * used to prioritize scrubbing), the cleanliness of a slab (so that clean slabs containing free
320 * space will be used on restart), and the location of the tail block of the slab's journal.
321 *
322 * The slab_summary has its own partition at the end of the volume which is sized to allow for a
323 * complete copy of the summary for each of up to 16 physical zones.
324 *
325 * During resize, the slab_summary moves its backing partition and is saved once moved; the
326 * slab_summary is not permitted to overwrite the previous recovery journal space.
327 *
328 * The slab_summary does not have its own version information, but relies on the VDO volume version
329 * number.
330 */
331
/*
 * A slab status is a very small structure for use in determining the ordering of slabs in the
 * scrubbing process.
 */
struct slab_status {
	/* The number of the slab this status describes */
	slab_count_t slab_number;
	/* NOTE(review): presumably the slab's cleanliness as recorded in the slab summary -- confirm */
	bool is_clean;
	/*
	 * NOTE(review): presumably derived from the slab summary's rough free block count, which
	 * is used to prioritize scrubbing -- confirm the encoding.
	 */
	u8 emptiness;
};
341
/*
 * The in-memory state of one block of a zone's slab summary. Each struct block_allocator holds
 * an array of these ('summary_blocks').
 */
struct slab_summary_block {
	/* The block_allocator to which this block belongs */
	struct block_allocator *allocator;
	/* The index of this block in its zone's summary */
	block_count_t index;
	/* Whether this block has a write outstanding */
	bool writing;
	/* Queue of updates waiting on the outstanding write */
	struct vdo_wait_queue current_update_waiters;
	/* Queue of updates waiting on the next write */
	struct vdo_wait_queue next_update_waiters;
	/* The active slab_summary_entry array for this block */
	struct slab_summary_entry *entries;
	/* The vio used to write this block */
	struct vio vio;
	/* The packed entries, one block long, backing the vio */
	char *outgoing_entries;
};
360
/*
 * The statistics for all the slab summary zones owned by this slab summary. These fields are all
 * mutated only by their physical zone threads, but are read by other threads when gathering
 * statistics for the entire depot. The slab_depot embeds one instance as 'summary_statistics'.
 */
struct atomic_slab_summary_statistics {
	/* Number of blocks written */
	atomic64_t blocks_written;
};
370
/*
 * The block allocator for one physical zone. The slab_depot holds one block_allocator per zone
 * in its trailing 'allocators' array. The embedded completion is what vdo_as_block_allocator()
 * converts back into the containing allocator.
 */
struct block_allocator {
	/* The completion embedded in this allocator (see vdo_as_block_allocator()) */
	struct vdo_completion completion;
	/* The slab depot for this allocator */
	struct slab_depot *depot;
	/* The nonce of the VDO */
	nonce_t nonce;
	/* The physical zone number of this allocator */
	zone_count_t zone_number;
	/* The thread ID for this allocator's physical zone */
	thread_id_t thread_id;
	/* The number of slabs in this allocator */
	slab_count_t slab_count;
	/* The number of the last slab owned by this allocator */
	slab_count_t last_slab;
	/* The reduced priority level used to preserve unopened slabs */
	unsigned int unopened_slab_priority;
	/* The state of this allocator */
	struct admin_state state;
	/* The actor for applying an action to all slabs */
	struct slab_actor slab_actor;

	/* The slab from which blocks are currently being allocated */
	struct vdo_slab *open_slab;
	/* A priority queue containing all slabs available for allocation */
	struct priority_table *prioritized_slabs;
	/* The slab scrubber */
	struct slab_scrubber scrubber;
	/* What phase of the close operation the allocator is to perform */
	enum block_allocator_drain_step drain_step;

	/*
	 * These statistics are all mutated only by the physical zone thread, but are read by other
	 * threads when gathering statistics for the entire depot.
	 */
	/*
	 * The count of allocated blocks in this zone. Not in block_allocator_statistics for
	 * historical reasons.
	 */
	u64 allocated_blocks;
	/* Statistics for this block allocator */
	struct block_allocator_statistics statistics;
	/* Cumulative statistics for the slab journals in this zone */
	struct slab_journal_statistics slab_journal_statistics;
	/* Cumulative statistics for the reference counters in this zone */
	struct ref_counts_statistics ref_counts_statistics;

	/*
	 * This is the head of a queue of slab journals which have entries in their tail blocks
	 * which have not yet started to commit. When the recovery journal is under space pressure,
	 * slab journals which have uncommitted entries holding a lock on the recovery journal head
	 * are forced to commit their blocks early. This list is kept in order, with the tail
	 * containing the slab journal holding the most recent recovery journal lock.
	 */
	struct list_head dirty_slab_journals;

	/* The vio pool for reading and writing block allocator metadata */
	struct vio_pool *vio_pool;
	/* The dm_kcopyd client for erasing slab journals */
	struct dm_kcopyd_client *eraser;
	/* Iterator over the slabs to be erased */
	struct slab_iterator slabs_to_erase;

	/* The fields below are the portion of the slab summary managed by this allocator. */
	/* The state of the slab summary */
	struct admin_state summary_state;
	/* The number of outstanding summary writes */
	block_count_t summary_write_count;
	/* The array (owned by the blocks) of all entries */
	struct slab_summary_entry *summary_entries;
	/* The array of slab_summary_blocks */
	struct slab_summary_block *summary_blocks;
};
443
/* How the depot is being loaded; stored in slab_depot.load_type to decide slab queuing. */
enum slab_depot_load_type {
	VDO_SLAB_DEPOT_NORMAL_LOAD = 0,
	VDO_SLAB_DEPOT_RECOVERY_LOAD = 1,
	VDO_SLAB_DEPOT_REBUILD_LOAD = 2,
};
449
/*
 * The slab depot: a single array of all slabs plus one block_allocator per physical zone (the
 * trailing flexible array member, so the depot must be allocated with room for the allocators).
 */
struct slab_depot {
	/* NOTE(review): presumably the number of entries in 'allocators' -- confirm */
	zone_count_t zone_count;
	/* NOTE(review): presumably a zone count saved from an earlier configuration -- confirm */
	zone_count_t old_zone_count;
	/* The vdo associated with this depot */
	struct vdo *vdo;
	/* The slab configuration used by this depot */
	struct slab_config slab_config;
	/* NOTE(review): presumably manages depot-wide actions across the zones -- confirm */
	struct action_manager *action_manager;

	/*
	 * NOTE(review): the block range covered by the depot; whether 'last_block' is inclusive
	 * and how 'origin' relates to 'first_block' are not visible in this header -- confirm.
	 */
	physical_block_number_t first_block;
	physical_block_number_t last_block;
	physical_block_number_t origin;

	/* slab_size == (1 << slab_size_shift) */
	unsigned int slab_size_shift;

	/* Determines how slabs should be queued during load */
	enum slab_depot_load_type load_type;

	/* The state for notifying slab journals to release recovery journal */
	sequence_number_t active_release_request;
	sequence_number_t new_release_request;

	/* State variables for scrubbing complete handling */
	atomic_t zones_to_scrub;

	/* Array of pointers to individually allocated slabs */
	struct vdo_slab **slabs;
	/* The number of slabs currently allocated and stored in 'slabs' */
	slab_count_t slab_count;

	/* Array of pointers to a larger set of slabs (used during resize) */
	struct vdo_slab **new_slabs;
	/* The number of slabs currently allocated and stored in 'new_slabs' */
	slab_count_t new_slab_count;
	/* The size that 'new_slabs' was allocated for */
	block_count_t new_size;

	/* The last block before resize, for rollback */
	physical_block_number_t old_last_block;
	/* The last block after resize, for resize */
	physical_block_number_t new_last_block;

	/* The statistics for the slab summary */
	struct atomic_slab_summary_statistics summary_statistics;
	/* The start of the slab summary partition */
	physical_block_number_t summary_origin;
	/* The number of bits to shift to get a 7-bit fullness hint */
	unsigned int hint_shift;
	/* The slab summary entries for all of the zones the partition can hold */
	struct slab_summary_entry *summary_entries;

	/* The block allocators for this depot */
	struct block_allocator allocators[];
};
503
/* Forward declaration; only pointers to reference_updater are used in this header. */
struct reference_updater;

/*
 * NOTE(review): presumably used while replaying the recovery journal into a slab; the meaning of
 * the bool result is defined by the implementation -- confirm there. Callers must check it.
 */
bool __must_check vdo_attempt_replay_into_slab(struct vdo_slab *slab,
					       physical_block_number_t pbn,
					       enum journal_operation operation,
					       bool increment,
					       struct journal_point *recovery_point,
					       struct vdo_completion *parent);

/*
 * NOTE(review): presumably adjusts the reference count of 'pbn' during a rebuild; the int result
 * is presumably a VDO status code -- confirm. Callers must check it.
 */
int __must_check vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
							physical_block_number_t pbn,
							enum journal_operation operation);
516
517static inline struct block_allocator *vdo_as_block_allocator(struct vdo_completion *completion)
518{
519	vdo_assert_completion_type(completion, VDO_BLOCK_ALLOCATOR_COMPLETION);
520	return container_of(completion, struct block_allocator, completion);
521}
522
/*
 * Per-zone operations on slabs and block allocators. Per the threading notes at the top of this
 * file, normal operations such as allocations and reference count updates must be performed on
 * the appropriate physical zone thread.
 */

int __must_check vdo_acquire_provisional_reference(struct vdo_slab *slab,
						   physical_block_number_t pbn,
						   struct pbn_lock *lock);

/* On success the allocated block is reported through 'block_number_ptr' (presumably; confirm). */
int __must_check vdo_allocate_block(struct block_allocator *allocator,
				    physical_block_number_t *block_number_ptr);

int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
				  struct vdo_waiter *waiter);

void vdo_modify_reference_count(struct vdo_completion *completion,
				struct reference_updater *updater);

int __must_check vdo_release_block_reference(struct block_allocator *allocator,
					     physical_block_number_t pbn);

void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion);

void vdo_dump_block_allocator(const struct block_allocator *allocator);

/*
 * Depot lifecycle: construction from saved state, teardown, and recording state.
 * NOTE(review): ownership of '*depot_ptr' presumably passes to the caller, to be released with
 * vdo_free_slab_depot() -- confirm.
 */

int __must_check vdo_decode_slab_depot(struct slab_depot_state_2_0 state,
				       struct vdo *vdo,
				       struct partition *summary_partition,
				       struct slab_depot **depot_ptr);

void vdo_free_slab_depot(struct slab_depot *depot);

struct slab_depot_state_2_0 __must_check vdo_record_slab_depot(const struct slab_depot *depot);

int __must_check vdo_allocate_reference_counters(struct slab_depot *depot);

/* Queries on a depot; these take a PBN and/or read depot-wide counts. */

struct vdo_slab * __must_check vdo_get_slab(const struct slab_depot *depot,
					    physical_block_number_t pbn);

u8 __must_check vdo_get_increment_limit(struct slab_depot *depot,
					physical_block_number_t pbn);

bool __must_check vdo_is_physical_data_block(const struct slab_depot *depot,
					     physical_block_number_t pbn);

block_count_t __must_check vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot);

block_count_t __must_check vdo_get_slab_depot_data_blocks(const struct slab_depot *depot);

void vdo_get_slab_depot_statistics(const struct slab_depot *depot,
				   struct vdo_statistics *stats);

/*
 * Depot-wide administrative operations. Per the threading notes at the top of this file, load
 * operations must be performed on the admin thread, and save operations must be launched from
 * the same admin thread as the original load operation.
 */

void vdo_load_slab_depot(struct slab_depot *depot,
			 const struct admin_state_code *operation,
			 struct vdo_completion *parent, void *context);

void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
					enum slab_depot_load_type load_type,
					struct vdo_completion *parent);

void vdo_update_slab_depot_size(struct slab_depot *depot);

int __must_check vdo_prepare_to_grow_slab_depot(struct slab_depot *depot,
						const struct partition *partition);

void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent);

void vdo_abandon_new_slabs(struct slab_depot *depot);

void vdo_drain_slab_depot(struct slab_depot *depot,
			  const struct admin_state_code *operation,
			  struct vdo_completion *parent);

void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent);

/*
 * Per the threading notes at the top of this file, requests from the recovery journal to commit
 * slab journal tail blocks must be scheduled from the recovery journal thread to run on the
 * appropriate physical zone thread.
 */
void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
						sequence_number_t recovery_block_number);

void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
				     struct vdo_completion *parent);

void vdo_dump_slab_depot(const struct slab_depot *depot);
600
601#endif /* VDO_SLAB_DEPOT_H */
602