1  /* SPDX-License-Identifier: GPL-2.0-only */
2  /*
3   * Copyright 2023 Red Hat
4   */
5  
6  #ifndef VDO_SLAB_DEPOT_H
7  #define VDO_SLAB_DEPOT_H
8  
9  #include <linux/atomic.h>
10  #include <linux/dm-kcopyd.h>
11  #include <linux/list.h>
12  
13  #include "numeric.h"
14  
15  #include "admin-state.h"
16  #include "completion.h"
17  #include "data-vio.h"
18  #include "encodings.h"
19  #include "physical-zone.h"
20  #include "priority-table.h"
21  #include "recovery-journal.h"
22  #include "statistics.h"
23  #include "types.h"
24  #include "vio.h"
25  #include "wait-queue.h"
26  
/*
 * A slab_depot is responsible for managing all of the slabs and block allocators of a VDO. It has
 * a single array of slabs, which eliminates the need for additional math to compute which
 * physical zone a PBN is in. It also has a block_allocator per zone.
 *
 * Each physical zone has a single dedicated queue and thread for performing all updates to the
 * slabs assigned to that zone. The concurrency guarantees of this single-threaded model allow the
 * code to omit more fine-grained locking for the various slab structures. Each physical zone
 * maintains a separate copy of the slab summary to remove the need for explicit locking on that
 * structure as well.
 *
 * Load operations must be performed on the admin thread. Normal operations, such as allocations
 * and reference count updates, must be performed on the appropriate physical zone thread. Requests
 * from the recovery journal to commit slab journal tail blocks must be scheduled from the recovery
 * journal thread to run on the appropriate physical zone thread. Save operations must be launched
 * from the same admin thread as the original load operation.
 */
44  
enum {
	/* The number of vios in the vio pool is proportional to the throughput of the VDO. */
	BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,
};
49  
/*
 * The possible reference statuses of a block.
 */
enum reference_status {
	RS_FREE, /* this block is free */
	RS_SINGLE, /* this block is singly-referenced */
	RS_SHARED, /* this block is shared */
	RS_PROVISIONAL /* this block is provisionally allocated */
};
59  
60  struct vdo_slab;
61  
62  struct journal_lock {
63  	u16 count;
64  	sequence_number_t recovery_start;
65  };
66  
67  struct slab_journal {
68  	/* A waiter object for getting a VIO pool entry */
69  	struct vdo_waiter resource_waiter;
70  	/* A waiter object for updating the slab summary */
71  	struct vdo_waiter slab_summary_waiter;
72  	/* A waiter object for getting a vio with which to flush */
73  	struct vdo_waiter flush_waiter;
74  	/* The queue of VIOs waiting to make an entry */
75  	struct vdo_wait_queue entry_waiters;
76  	/* The parent slab reference of this journal */
77  	struct vdo_slab *slab;
78  
79  	/* Whether a tail block commit is pending */
80  	bool waiting_to_commit;
81  	/* Whether the journal is updating the slab summary */
82  	bool updating_slab_summary;
83  	/* Whether the journal is adding entries from the entry_waiters queue */
84  	bool adding_entries;
85  	/* Whether a partial write is in progress */
86  	bool partial_write_in_progress;
87  
88  	/* The oldest block in the journal on disk */
89  	sequence_number_t head;
90  	/* The oldest block in the journal which may not be reaped */
91  	sequence_number_t unreapable;
92  	/* The end of the half-open interval of the active journal */
93  	sequence_number_t tail;
94  	/* The next journal block to be committed */
95  	sequence_number_t next_commit;
96  	/* The tail sequence number that is written in the slab summary */
97  	sequence_number_t summarized;
98  	/* The tail sequence number that was last summarized in slab summary */
99  	sequence_number_t last_summarized;
100  
101  	/* The sequence number of the recovery journal lock */
102  	sequence_number_t recovery_lock;
103  
104  	/*
105  	 * The number of entries which fit in a single block. Can't use the constant because unit
106  	 * tests change this number.
107  	 */
108  	journal_entry_count_t entries_per_block;
109  	/*
110  	 * The number of full entries which fit in a single block. Can't use the constant because
111  	 * unit tests change this number.
112  	 */
113  	journal_entry_count_t full_entries_per_block;
114  
115  	/* The recovery journal of the VDO (slab journal holds locks on it) */
116  	struct recovery_journal *recovery_journal;
117  
118  	/* The statistics shared by all slab journals in our physical zone */
119  	struct slab_journal_statistics *events;
120  	/* A list of the VIO pool entries for outstanding journal block writes */
121  	struct list_head uncommitted_blocks;
122  
123  	/*
124  	 * The current tail block header state. This will be packed into the block just before it
125  	 * is written.
126  	 */
127  	struct slab_journal_block_header tail_header;
128  	/* A pointer to a block-sized buffer holding the packed block data */
129  	struct packed_slab_journal_block *block;
130  
131  	/* The number of blocks in the on-disk journal */
132  	block_count_t size;
133  	/* The number of blocks at which to start pushing reference blocks */
134  	block_count_t flushing_threshold;
135  	/* The number of blocks at which all reference blocks should be writing */
136  	block_count_t flushing_deadline;
137  	/* The number of blocks at which to wait for reference blocks to write */
138  	block_count_t blocking_threshold;
139  	/* The number of blocks at which to scrub the slab before coming online */
140  	block_count_t scrubbing_threshold;
141  
142  	/* This list entry is for block_allocator to keep a queue of dirty journals */
143  	struct list_head dirty_entry;
144  
145  	/* The lock for the oldest unreaped block of the journal */
146  	struct journal_lock *reap_lock;
147  	/* The locks for each on disk block */
148  	struct journal_lock *locks;
149  };
150  
151  /*
152   * Reference_block structure
153   *
154   * Blocks are used as a proxy, permitting saves of partial refcounts.
155   */
156  struct reference_block {
157  	/* This block waits on the ref_counts to tell it to write */
158  	struct vdo_waiter waiter;
159  	/* The slab to which this reference_block belongs */
160  	struct vdo_slab *slab;
161  	/* The number of references in this block that represent allocations */
162  	block_size_t allocated_count;
163  	/* The slab journal block on which this block must hold a lock */
164  	sequence_number_t slab_journal_lock;
165  	/* The slab journal block which should be released when this block is committed */
166  	sequence_number_t slab_journal_lock_to_release;
167  	/* The point up to which each sector is accurate on disk */
168  	struct journal_point commit_points[VDO_SECTORS_PER_BLOCK];
169  	/* Whether this block has been modified since it was written to disk */
170  	bool is_dirty;
171  	/* Whether this block is currently writing */
172  	bool is_writing;
173  };
174  
175  /* The search_cursor represents the saved position of a free block search. */
176  struct search_cursor {
177  	/* The reference block containing the current search index */
178  	struct reference_block *block;
179  	/* The position at which to start searching for the next free counter */
180  	slab_block_number index;
181  	/* The position just past the last valid counter in the current block */
182  	slab_block_number end_index;
183  
184  	/* A pointer to the first reference block in the slab */
185  	struct reference_block *first_block;
186  	/* A pointer to the last reference block in the slab */
187  	struct reference_block *last_block;
188  };
189  
/*
 * The rebuild status of a slab, as stored in the 'status' field of struct vdo_slab.
 * NOTE(review): the ordering of these values may be significant to comparisons elsewhere;
 * do not reorder without checking the users.
 */
enum slab_rebuild_status {
	VDO_SLAB_REBUILT,
	VDO_SLAB_REPLAYING,
	VDO_SLAB_REQUIRES_SCRUBBING,
	VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING,
	VDO_SLAB_REBUILDING,
};
197  
198  /*
199   * This is the type declaration for the vdo_slab type. A vdo_slab currently consists of a run of
200   * 2^23 data blocks, but that will soon change to dedicate a small number of those blocks for
201   * metadata storage for the reference counts and slab journal for the slab.
202   *
203   * A reference count is maintained for each physical block number. The vast majority of blocks have
204   * a very small reference count (usually 0 or 1). For references less than or equal to MAXIMUM_REFS
205   * (254) the reference count is stored in counters[pbn].
206   */
207  struct vdo_slab {
208  	/* A list entry to queue this slab in a block_allocator list */
209  	struct list_head allocq_entry;
210  
211  	/* The struct block_allocator that owns this slab */
212  	struct block_allocator *allocator;
213  
214  	/* The journal for this slab */
215  	struct slab_journal journal;
216  
217  	/* The slab number of this slab */
218  	slab_count_t slab_number;
219  	/* The offset in the allocator partition of the first block in this slab */
220  	physical_block_number_t start;
221  	/* The offset of the first block past the end of this slab */
222  	physical_block_number_t end;
223  	/* The starting translated PBN of the slab journal */
224  	physical_block_number_t journal_origin;
225  	/* The starting translated PBN of the reference counts */
226  	physical_block_number_t ref_counts_origin;
227  
228  	/* The administrative state of the slab */
229  	struct admin_state state;
230  	/* The status of the slab */
231  	enum slab_rebuild_status status;
232  	/* Whether the slab was ever queued for scrubbing */
233  	bool was_queued_for_scrubbing;
234  
235  	/* The priority at which this slab has been queued for allocation */
236  	u8 priority;
237  
238  	/* Fields beyond this point are the reference counts for the data blocks in this slab. */
239  	/* The size of the counters array */
240  	u32 block_count;
241  	/* The number of free blocks */
242  	u32 free_blocks;
243  	/* The array of reference counts */
244  	vdo_refcount_t *counters; /* use vdo_allocate() to align data ptr */
245  
246  	/* The saved block pointer and array indexes for the free block search */
247  	struct search_cursor search_cursor;
248  
249  	/* A list of the dirty blocks waiting to be written out */
250  	struct vdo_wait_queue dirty_blocks;
251  	/* The number of blocks which are currently writing */
252  	size_t active_count;
253  
254  	/* A waiter object for updating the slab summary */
255  	struct vdo_waiter summary_waiter;
256  
257  	/* The latest slab journal for which there has been a reference count update */
258  	struct journal_point slab_journal_point;
259  
260  	/* The number of reference count blocks */
261  	u32 reference_block_count;
262  	/* reference count block array */
263  	struct reference_block *reference_blocks;
264  };
265  
/* The steps of draining a block_allocator, as stored in its 'drain_step' field. */
enum block_allocator_drain_step {
	VDO_DRAIN_ALLOCATOR_START,
	VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER,
	VDO_DRAIN_ALLOCATOR_STEP_SLABS,
	VDO_DRAIN_ALLOCATOR_STEP_SUMMARY,
	VDO_DRAIN_ALLOCATOR_STEP_FINISHED,
};
273  
274  struct slab_scrubber {
275  	/* The queue of slabs to scrub first */
276  	struct list_head high_priority_slabs;
277  	/* The queue of slabs to scrub once there are no high_priority_slabs */
278  	struct list_head slabs;
279  	/* The queue of VIOs waiting for a slab to be scrubbed */
280  	struct vdo_wait_queue waiters;
281  
282  	/*
283  	 * The number of slabs that are unrecovered or being scrubbed. This field is modified by
284  	 * the physical zone thread, but is queried by other threads.
285  	 */
286  	slab_count_t slab_count;
287  
288  	/* The administrative state of the scrubber */
289  	struct admin_state admin_state;
290  	/* Whether to only scrub high-priority slabs */
291  	bool high_priority_only;
292  	/* The slab currently being scrubbed */
293  	struct vdo_slab *slab;
294  	/* The vio for loading slab journal blocks */
295  	struct vio vio;
296  };
297  
298  /* A sub-structure for applying actions in parallel to all an allocator's slabs. */
299  struct slab_actor {
300  	/* The number of slabs performing a slab action */
301  	slab_count_t slab_action_count;
302  	/* The method to call when a slab action has been completed by all slabs */
303  	vdo_action_fn callback;
304  };
305  
306  /* A slab_iterator is a structure for iterating over a set of slabs. */
307  struct slab_iterator {
308  	struct vdo_slab **slabs;
309  	struct vdo_slab *next;
310  	slab_count_t end;
311  	slab_count_t stride;
312  };
313  
/*
 * The slab_summary provides hints during load and recovery about the state of the slabs in order
 * to avoid the need to read the slab journals in their entirety before a VDO can come online.
 *
 * The information in the summary for each slab includes the rough number of free blocks (which is
 * used to prioritize scrubbing), the cleanliness of a slab (so that clean slabs containing free
 * space will be used on restart), and the location of the tail block of the slab's journal.
 *
 * The slab_summary has its own partition at the end of the volume which is sized to allow for a
 * complete copy of the summary for each of up to 16 physical zones.
 *
 * During resize, the slab_summary moves its backing partition and is saved once moved; the
 * slab_summary is not permitted to overwrite the previous recovery journal space.
 *
 * The slab_summary does not have its own version information, but relies on the VDO volume version
 * number.
 */
331  
332  /*
333   * A slab status is a very small structure for use in determining the ordering of slabs in the
334   * scrubbing process.
335   */
336  struct slab_status {
337  	slab_count_t slab_number;
338  	bool is_clean;
339  	u8 emptiness;
340  };
341  
342  struct slab_summary_block {
343  	/* The block_allocator to which this block belongs */
344  	struct block_allocator *allocator;
345  	/* The index of this block in its zone's summary */
346  	block_count_t index;
347  	/* Whether this block has a write outstanding */
348  	bool writing;
349  	/* Ring of updates waiting on the outstanding write */
350  	struct vdo_wait_queue current_update_waiters;
351  	/* Ring of updates waiting on the next write */
352  	struct vdo_wait_queue next_update_waiters;
353  	/* The active slab_summary_entry array for this block */
354  	struct slab_summary_entry *entries;
355  	/* The vio used to write this block */
356  	struct vio vio;
357  	/* The packed entries, one block long, backing the vio */
358  	char *outgoing_entries;
359  };
360  
361  /*
362   * The statistics for all the slab summary zones owned by this slab summary. These fields are all
363   * mutated only by their physical zone threads, but are read by other threads when gathering
364   * statistics for the entire depot.
365   */
366  struct atomic_slab_summary_statistics {
367  	/* Number of blocks written */
368  	atomic64_t blocks_written;
369  };
370  
371  struct block_allocator {
372  	struct vdo_completion completion;
373  	/* The slab depot for this allocator */
374  	struct slab_depot *depot;
375  	/* The nonce of the VDO */
376  	nonce_t nonce;
377  	/* The physical zone number of this allocator */
378  	zone_count_t zone_number;
379  	/* The thread ID for this allocator's physical zone */
380  	thread_id_t thread_id;
381  	/* The number of slabs in this allocator */
382  	slab_count_t slab_count;
383  	/* The number of the last slab owned by this allocator */
384  	slab_count_t last_slab;
385  	/* The reduced priority level used to preserve unopened slabs */
386  	unsigned int unopened_slab_priority;
387  	/* The state of this allocator */
388  	struct admin_state state;
389  	/* The actor for applying an action to all slabs */
390  	struct slab_actor slab_actor;
391  
392  	/* The slab from which blocks are currently being allocated */
393  	struct vdo_slab *open_slab;
394  	/* A priority queue containing all slabs available for allocation */
395  	struct priority_table *prioritized_slabs;
396  	/* The slab scrubber */
397  	struct slab_scrubber scrubber;
398  	/* What phase of the close operation the allocator is to perform */
399  	enum block_allocator_drain_step drain_step;
400  
401  	/*
402  	 * These statistics are all mutated only by the physical zone thread, but are read by other
403  	 * threads when gathering statistics for the entire depot.
404  	 */
405  	/*
406  	 * The count of allocated blocks in this zone. Not in block_allocator_statistics for
407  	 * historical reasons.
408  	 */
409  	u64 allocated_blocks;
410  	/* Statistics for this block allocator */
411  	struct block_allocator_statistics statistics;
412  	/* Cumulative statistics for the slab journals in this zone */
413  	struct slab_journal_statistics slab_journal_statistics;
414  	/* Cumulative statistics for the reference counters in this zone */
415  	struct ref_counts_statistics ref_counts_statistics;
416  
417  	/*
418  	 * This is the head of a queue of slab journals which have entries in their tail blocks
419  	 * which have not yet started to commit. When the recovery journal is under space pressure,
420  	 * slab journals which have uncommitted entries holding a lock on the recovery journal head
421  	 * are forced to commit their blocks early. This list is kept in order, with the tail
422  	 * containing the slab journal holding the most recent recovery journal lock.
423  	 */
424  	struct list_head dirty_slab_journals;
425  
426  	/* The vio pool for reading and writing block allocator metadata */
427  	struct vio_pool *vio_pool;
428  	/* The dm_kcopyd client for erasing slab journals */
429  	struct dm_kcopyd_client *eraser;
430  	/* Iterator over the slabs to be erased */
431  	struct slab_iterator slabs_to_erase;
432  
433  	/* The portion of the slab summary managed by this allocator */
434  	/* The state of the slab summary */
435  	struct admin_state summary_state;
436  	/* The number of outstanding summary writes */
437  	block_count_t summary_write_count;
438  	/* The array (owned by the blocks) of all entries */
439  	struct slab_summary_entry *summary_entries;
440  	/* The array of slab_summary_blocks */
441  	struct slab_summary_block *summary_blocks;
442  };
443  
/*
 * The kind of load being performed on a slab depot. The depot's 'load_type' field records it
 * and "determines how slabs should be queued during load".
 */
enum slab_depot_load_type {
	VDO_SLAB_DEPOT_NORMAL_LOAD,
	VDO_SLAB_DEPOT_RECOVERY_LOAD,
	VDO_SLAB_DEPOT_REBUILD_LOAD
};
449  
450  struct slab_depot {
451  	zone_count_t zone_count;
452  	zone_count_t old_zone_count;
453  	struct vdo *vdo;
454  	struct slab_config slab_config;
455  	struct action_manager *action_manager;
456  
457  	physical_block_number_t first_block;
458  	physical_block_number_t last_block;
459  	physical_block_number_t origin;
460  
461  	/* slab_size == (1 << slab_size_shift) */
462  	unsigned int slab_size_shift;
463  
464  	/* Determines how slabs should be queued during load */
465  	enum slab_depot_load_type load_type;
466  
467  	/* The state for notifying slab journals to release recovery journal */
468  	sequence_number_t active_release_request;
469  	sequence_number_t new_release_request;
470  
471  	/* State variables for scrubbing complete handling */
472  	atomic_t zones_to_scrub;
473  
474  	/* Array of pointers to individually allocated slabs */
475  	struct vdo_slab **slabs;
476  	/* The number of slabs currently allocated and stored in 'slabs' */
477  	slab_count_t slab_count;
478  
479  	/* Array of pointers to a larger set of slabs (used during resize) */
480  	struct vdo_slab **new_slabs;
481  	/* The number of slabs currently allocated and stored in 'new_slabs' */
482  	slab_count_t new_slab_count;
483  	/* The size that 'new_slabs' was allocated for */
484  	block_count_t new_size;
485  
486  	/* The last block before resize, for rollback */
487  	physical_block_number_t old_last_block;
488  	/* The last block after resize, for resize */
489  	physical_block_number_t new_last_block;
490  
491  	/* The statistics for the slab summary */
492  	struct atomic_slab_summary_statistics summary_statistics;
493  	/* The start of the slab summary partition */
494  	physical_block_number_t summary_origin;
495  	/* The number of bits to shift to get a 7-bit fullness hint */
496  	unsigned int hint_shift;
497  	/* The slab summary entries for all of the zones the partition can hold */
498  	struct slab_summary_entry *summary_entries;
499  
500  	/* The block allocators for this depot */
501  	struct block_allocator allocators[];
502  };
503  
504  struct reference_updater;
505  
506  bool __must_check vdo_attempt_replay_into_slab(struct vdo_slab *slab,
507  					       physical_block_number_t pbn,
508  					       enum journal_operation operation,
509  					       bool increment,
510  					       struct journal_point *recovery_point,
511  					       struct vdo_completion *parent);
512  
513  int __must_check vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
514  							physical_block_number_t pbn,
515  							enum journal_operation operation);
516  
vdo_as_block_allocator(struct vdo_completion * completion)517  static inline struct block_allocator *vdo_as_block_allocator(struct vdo_completion *completion)
518  {
519  	vdo_assert_completion_type(completion, VDO_BLOCK_ALLOCATOR_COMPLETION);
520  	return container_of(completion, struct block_allocator, completion);
521  }
522  
523  int __must_check vdo_acquire_provisional_reference(struct vdo_slab *slab,
524  						   physical_block_number_t pbn,
525  						   struct pbn_lock *lock);
526  
527  int __must_check vdo_allocate_block(struct block_allocator *allocator,
528  				    physical_block_number_t *block_number_ptr);
529  
530  int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
531  				  struct vdo_waiter *waiter);
532  
533  void vdo_modify_reference_count(struct vdo_completion *completion,
534  				struct reference_updater *updater);
535  
536  int __must_check vdo_release_block_reference(struct block_allocator *allocator,
537  					     physical_block_number_t pbn);
538  
539  void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion);
540  
541  void vdo_dump_block_allocator(const struct block_allocator *allocator);
542  
543  int __must_check vdo_decode_slab_depot(struct slab_depot_state_2_0 state,
544  				       struct vdo *vdo,
545  				       struct partition *summary_partition,
546  				       struct slab_depot **depot_ptr);
547  
548  void vdo_free_slab_depot(struct slab_depot *depot);
549  
550  struct slab_depot_state_2_0 __must_check vdo_record_slab_depot(const struct slab_depot *depot);
551  
552  int __must_check vdo_allocate_reference_counters(struct slab_depot *depot);
553  
554  struct vdo_slab * __must_check vdo_get_slab(const struct slab_depot *depot,
555  					    physical_block_number_t pbn);
556  
557  u8 __must_check vdo_get_increment_limit(struct slab_depot *depot,
558  					physical_block_number_t pbn);
559  
560  bool __must_check vdo_is_physical_data_block(const struct slab_depot *depot,
561  					     physical_block_number_t pbn);
562  
563  block_count_t __must_check vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot);
564  
565  block_count_t __must_check vdo_get_slab_depot_data_blocks(const struct slab_depot *depot);
566  
567  void vdo_get_slab_depot_statistics(const struct slab_depot *depot,
568  				   struct vdo_statistics *stats);
569  
570  void vdo_load_slab_depot(struct slab_depot *depot,
571  			 const struct admin_state_code *operation,
572  			 struct vdo_completion *parent, void *context);
573  
574  void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
575  					enum slab_depot_load_type load_type,
576  					struct vdo_completion *parent);
577  
578  void vdo_update_slab_depot_size(struct slab_depot *depot);
579  
580  int __must_check vdo_prepare_to_grow_slab_depot(struct slab_depot *depot,
581  						const struct partition *partition);
582  
583  void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent);
584  
585  void vdo_abandon_new_slabs(struct slab_depot *depot);
586  
587  void vdo_drain_slab_depot(struct slab_depot *depot,
588  			  const struct admin_state_code *operation,
589  			  struct vdo_completion *parent);
590  
591  void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent);
592  
593  void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
594  						sequence_number_t recovery_block_number);
595  
596  void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
597  				     struct vdo_completion *parent);
598  
599  void vdo_dump_slab_depot(const struct slab_depot *depot);
600  
601  #endif /* VDO_SLAB_DEPOT_H */
602