summaryrefslogtreecommitdiff
path: root/drivers/md/dm-vdo/block-map.h
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/dm-vdo/block-map.h')
-rw-r--r--drivers/md/dm-vdo/block-map.h394
1 files changed, 394 insertions, 0 deletions
diff --git a/drivers/md/dm-vdo/block-map.h b/drivers/md/dm-vdo/block-map.h
new file mode 100644
index 000000000000..39a13039e4a3
--- /dev/null
+++ b/drivers/md/dm-vdo/block-map.h
@@ -0,0 +1,394 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_BLOCK_MAP_H
+#define VDO_BLOCK_MAP_H
+
+#include <linux/list.h>
+
+#include "numeric.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "encodings.h"
+#include "int-map.h"
+#include "statistics.h"
+#include "types.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/*
+ * The block map is responsible for tracking all the logical to physical mappings of a VDO. It
+ * consists of a collection of 60 radix trees gradually allocated as logical addresses are used.
+ * Each tree is assigned to a logical zone such that it is easy to compute which zone must handle
+ * each logical address. Each logical zone also has a dedicated portion of the leaf page cache.
+ *
+ * Each logical zone has a single dedicated queue and thread for performing all updates to the
+ * radix trees assigned to that zone. The concurrency guarantees of this single-threaded model
+ * allow the code to omit more fine-grained locking for the block map structures.
+ *
+ * Load operations must be performed on the admin thread. Normal operations, such as reading and
+ * updating mappings, must be performed on the appropriate logical zone thread. Save operations
+ * must be launched from the same admin thread as the original load operation.
+ */
+
+enum {
+ BLOCK_MAP_VIO_POOL_SIZE = 64,
+};
+
+/*
+ * Generation counter for page references.
+ */
+typedef u32 vdo_page_generation;
+
+extern const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY;
+
+/* The VDO Page Cache abstraction. */
+struct vdo_page_cache {
+ /* the VDO which owns this cache */
+ struct vdo *vdo;
+ /* number of pages in cache */
+ page_count_t page_count;
+ /* number of pages to write in the current batch */
+ page_count_t pages_in_batch;
+ /* Whether the VDO is doing a read-only rebuild */
+ bool rebuilding;
+
+ /* array of page information entries */
+ struct page_info *infos;
+ /* raw memory for pages */
+ char *pages;
+ /* cache last found page info */
+ struct page_info *last_found;
+ /* map of page number to info */
+ struct int_map *page_map;
+ /* main LRU list (all infos) */
+ struct list_head lru_list;
+ /* free page list (oldest first) */
+ struct list_head free_list;
+ /* outgoing page list */
+ struct list_head outgoing_list;
+ /* number of read I/O operations pending */
+ page_count_t outstanding_reads;
+ /* number of write I/O operations pending */
+ page_count_t outstanding_writes;
+ /* number of pages covered by the current flush */
+ page_count_t pages_in_flush;
+ /* number of pages waiting to be included in the next flush */
+ page_count_t pages_to_flush;
+ /* number of discards in progress */
+ unsigned int discard_count;
+ /* how many VPCs waiting for free page */
+ unsigned int waiter_count;
+ /* queue of waiters who want a free page */
+ struct vdo_wait_queue free_waiters;
+ /*
+ * Statistics are only updated on the logical zone thread, but are accessed from other
+ * threads.
+ */
+ struct block_map_statistics stats;
+ /* counter for pressure reports */
+ u32 pressure_report;
+ /* the block map zone to which this cache belongs */
+ struct block_map_zone *zone;
+};
+
+/*
+ * The state of a page buffer. If the page buffer is free no particular page is bound to it,
+ * otherwise the page buffer is bound to particular page whose absolute pbn is in the pbn field. If
+ * the page is resident or dirty the page data is stable and may be accessed. Otherwise the page is
+ * in flight (incoming or outgoing) and its data should not be accessed.
+ *
+ * @note Update the static data in get_page_state_name() if you change this enumeration.
+ */
+enum vdo_page_buffer_state {
+ /* this page buffer is not being used */
+ PS_FREE,
+ /* this page is being read from store */
+ PS_INCOMING,
+ /* attempt to load this page failed */
+ PS_FAILED,
+ /* this page is valid and un-modified */
+ PS_RESIDENT,
+ /* this page is valid and modified */
+ PS_DIRTY,
+ /* this page is being written and should not be used */
+ PS_OUTGOING,
+ /* not a state */
+ PAGE_STATE_COUNT,
+} __packed;
+
+/*
+ * The write status of page
+ */
+enum vdo_page_write_status {
+ WRITE_STATUS_NORMAL,
+ WRITE_STATUS_DISCARD,
+ WRITE_STATUS_DEFERRED,
+} __packed;
+
+/* Per-page-slot information. */
+struct page_info {
+ /* Preallocated page struct vio */
+ struct vio *vio;
+ /* back-link for references */
+ struct vdo_page_cache *cache;
+ /* the pbn of the page */
+ physical_block_number_t pbn;
+ /* page is busy (temporarily locked) */
+ u16 busy;
+ /* the write status the page */
+ enum vdo_page_write_status write_status;
+ /* page state */
+ enum vdo_page_buffer_state state;
+ /* queue of completions awaiting this item */
+ struct vdo_wait_queue waiting;
+ /* state linked list entry */
+ struct list_head state_entry;
+ /* LRU entry */
+ struct list_head lru_entry;
+ /*
+ * The earliest recovery journal block containing uncommitted updates to the block map page
+ * associated with this page_info. A reference (lock) is held on that block to prevent it
+ * from being reaped. When this value changes, the reference on the old value must be
+ * released and a reference on the new value must be acquired.
+ */
+ sequence_number_t recovery_lock;
+};
+
+/*
+ * A completion awaiting a specific page. Also a live reference into the page once completed, until
+ * freed.
+ */
+struct vdo_page_completion {
+ /* The generic completion */
+ struct vdo_completion completion;
+ /* The cache involved */
+ struct vdo_page_cache *cache;
+ /* The waiter for the pending list */
+ struct vdo_waiter waiter;
+ /* The absolute physical block number of the page on disk */
+ physical_block_number_t pbn;
+ /* Whether the page may be modified */
+ bool writable;
+ /* Whether the page is available */
+ bool ready;
+ /* The info structure for the page, only valid when ready */
+ struct page_info *info;
+};
+
+struct forest;
+
+struct tree_page {
+ struct vdo_waiter waiter;
+
+ /* Dirty list entry */
+ struct list_head entry;
+
+ /* If dirty, the tree zone flush generation in which it was last dirtied. */
+ u8 generation;
+
+ /* Whether this page is an interior tree page being written out. */
+ bool writing;
+
+ /* If writing, the tree zone flush generation of the copy being written. */
+ u8 writing_generation;
+
+ /*
+ * Sequence number of the earliest recovery journal block containing uncommitted updates to
+ * this page
+ */
+ sequence_number_t recovery_lock;
+
+ /* The value of recovery_lock when the this page last started writing */
+ sequence_number_t writing_recovery_lock;
+
+ char page_buffer[VDO_BLOCK_SIZE];
+};
+
+enum block_map_page_type {
+ VDO_TREE_PAGE,
+ VDO_CACHE_PAGE,
+};
+
+typedef struct list_head dirty_era_t[2];
+
+struct dirty_lists {
+ /* The number of periods after which an element will be expired */
+ block_count_t maximum_age;
+ /* The oldest period which has unexpired elements */
+ sequence_number_t oldest_period;
+ /* One more than the current period */
+ sequence_number_t next_period;
+ /* The offset in the array of lists of the oldest period */
+ block_count_t offset;
+ /* Expired pages */
+ dirty_era_t expired;
+ /* The lists of dirty pages */
+ dirty_era_t eras[];
+};
+
+struct block_map_zone {
+ zone_count_t zone_number;
+ thread_id_t thread_id;
+ struct admin_state state;
+ struct block_map *block_map;
+ /* Dirty pages, by era*/
+ struct dirty_lists *dirty_lists;
+ struct vdo_page_cache page_cache;
+ data_vio_count_t active_lookups;
+ struct int_map *loading_pages;
+ struct vio_pool *vio_pool;
+ /* The tree page which has issued or will be issuing a flush */
+ struct tree_page *flusher;
+ struct vdo_wait_queue flush_waiters;
+ /* The generation after the most recent flush */
+ u8 generation;
+ u8 oldest_generation;
+ /* The counts of dirty pages in each generation */
+ u32 dirty_page_counts[256];
+};
+
+struct block_map {
+ struct vdo *vdo;
+ struct action_manager *action_manager;
+ /* The absolute PBN of the first root of the tree part of the block map */
+ physical_block_number_t root_origin;
+ block_count_t root_count;
+
+ /* The era point we are currently distributing to the zones */
+ sequence_number_t current_era_point;
+ /* The next era point */
+ sequence_number_t pending_era_point;
+
+ /* The number of entries in block map */
+ block_count_t entry_count;
+ nonce_t nonce;
+ struct recovery_journal *journal;
+
+ /* The trees for finding block map pages */
+ struct forest *forest;
+ /* The expanded trees awaiting growth */
+ struct forest *next_forest;
+ /* The number of entries after growth */
+ block_count_t next_entry_count;
+
+ zone_count_t zone_count;
+ struct block_map_zone zones[];
+};
+
+/**
+ * typedef vdo_entry_callback_fn - A function to be called for each allocated PBN when traversing
+ * the forest.
+ * @pbn: A PBN of a tree node.
+ * @completion: The parent completion of the traversal.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+typedef int (*vdo_entry_callback_fn)(physical_block_number_t pbn,
+ struct vdo_completion *completion);
+
+static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion)
+{
+ vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION);
+ return container_of(completion, struct vdo_page_completion, completion);
+}
+
+void vdo_release_page_completion(struct vdo_completion *completion);
+
+void vdo_get_page(struct vdo_page_completion *page_completion,
+ struct block_map_zone *zone, physical_block_number_t pbn,
+ bool writable, void *parent, vdo_action_fn callback,
+ vdo_action_fn error_handler, bool requeue);
+
+void vdo_request_page_write(struct vdo_completion *completion);
+
+int __must_check vdo_get_cached_page(struct vdo_completion *completion,
+ struct block_map_page **page_ptr);
+
+int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache);
+
+static inline struct block_map_page * __must_check
+vdo_as_block_map_page(struct tree_page *tree_page)
+{
+ return (struct block_map_page *) tree_page->page_buffer;
+}
+
+bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
+ physical_block_number_t pbn,
+ struct block_map_page *page);
+
+void vdo_find_block_map_slot(struct data_vio *data_vio);
+
+physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
+ page_number_t page_number);
+
+void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone);
+
+void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
+ struct vdo_completion *completion);
+
+int __must_check vdo_decode_block_map(struct block_map_state_2_0 state,
+ block_count_t logical_blocks, struct vdo *vdo,
+ struct recovery_journal *journal, nonce_t nonce,
+ page_count_t cache_size, block_count_t maximum_age,
+ struct block_map **map_ptr);
+
+void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
+ struct vdo_completion *parent);
+
+void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent);
+
+int __must_check vdo_prepare_to_grow_block_map(struct block_map *map,
+ block_count_t new_logical_blocks);
+
+void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent);
+
+void vdo_abandon_block_map_growth(struct block_map *map);
+
+void vdo_free_block_map(struct block_map *map);
+
+struct block_map_state_2_0 __must_check vdo_record_block_map(const struct block_map *map);
+
+void vdo_initialize_block_map_from_journal(struct block_map *map,
+ struct recovery_journal *journal);
+
+zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio);
+
+void vdo_advance_block_map_era(struct block_map *map,
+ sequence_number_t recovery_block_number);
+
+void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
+ physical_block_number_t pbn,
+ enum block_mapping_state mapping_state,
+ sequence_number_t *recovery_lock);
+
+void vdo_get_mapped_block(struct data_vio *data_vio);
+
+void vdo_put_mapped_block(struct data_vio *data_vio);
+
+struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map);
+
+/**
+ * vdo_convert_maximum_age() - Convert the maximum age to reflect the new recovery journal format
+ * @age: The configured maximum age
+ *
+ * Return: The converted age
+ *
+ * In the old recovery journal format, each journal block held 311 entries, and every write bio
+ * made two entries. The old maximum age was half the usable journal length. In the new format,
+ * each block holds only 217 entries, but each bio only makes one entry. We convert the configured
+ * age so that the number of writes in a block map era is the same in the old and new formats. This
+ * keeps the bound on the amount of work required to recover the block map from the recovery
+ * journal the same across the format change. It also keeps the amortization of block map page
+ * writes to write bios the same.
+ */
+static inline block_count_t vdo_convert_maximum_age(block_count_t age)
+{
+ return DIV_ROUND_UP(age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK,
+ 2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK);
+}
+
+#endif /* VDO_BLOCK_MAP_H */