path: root/drivers/md/dm-vdo/indexer/sparse-cache.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "sparse-cache.h"

#include <linux/cache.h>
#include <linux/delay.h>
#include <linux/dm-bufio.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

#include "chapter-index.h"
#include "config.h"
#include "index.h"

/*
 * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a
 * specific virtual chapter is implemented as a linear search. The cache replacement policy is
 * least-recently-used (LRU). Again, the small size of the cache allows the LRU order to be
 * maintained by shifting entries in an array list.
 *
 * Changing the contents of the cache requires the coordinated participation of all zone threads
 * via the careful use of barrier messages sent to all the index zones by the triage queue worker
 * thread. The critical invariant for coordination is that the cache membership must not change
 * between updates, so all calls to uds_sparse_cache_contains() from the zone threads must
 * receive the same results for every virtual chapter number. To maintain that critical invariant,
 * state changes such as "that virtual chapter is no longer in the volume" and "skip searching that
 * chapter because it has had too many cache misses" are represented separately from the cache
 * membership information (the virtual chapter number).
 *
 * As a result of this invariant, we are guaranteed that every zone thread will call
 * uds_update_sparse_cache() exactly once to request a chapter that is not in the cache,
 * and the serialization of the barrier requests from the triage queue ensures they will all
 * request the same chapter number. This means the only synchronization we need can be provided by
 * a pair of thread barriers used only in the uds_update_sparse_cache() call, providing a critical
 * section where a single zone thread can drive the cache update while all the other zone threads
 * are known to be blocked, waiting in the second barrier. Outside that critical section, all the
 * zone threads implicitly hold a shared lock. Inside it, the thread for zone zero holds an
 * exclusive lock. No other threads may access or modify the cache entries.
 *
 * Chapter statistics must only be modified by a single thread, which is also the zone zero thread.
 * All fields that might be frequently updated by that thread are kept in separate cache-aligned
 * structures so they will not cause cache contention via "false sharing" with the fields that are
 * frequently accessed by all of the zone threads.
 *
 * The LRU order is managed independently by each zone thread, and each zone uses its own list for
 * searching and cache membership queries. The zone zero list is used to decide which chapter to
 * evict when the cache is updated, and it is copied to the other zone threads at that time.
 *
 * The virtual chapter number field of the cache entry is the single field indicating whether a
 * chapter is a member of the cache or not. The value NO_CHAPTER is used to represent a null or
 * undefined chapter number. When present in the virtual chapter number field of a
 * cached_chapter_index, it indicates that the cache entry is dead, and all the other fields of
 * that entry (other than immutable pointers to cache memory) are undefined and irrelevant. Any
 * cache entry that is not marked as dead is fully defined and a member of the cache, and
 * uds_sparse_cache_contains() will always return true for any virtual chapter number that appears
 * in any of the cache entries.
 *
 * A chapter index that is a member of the cache may be excluded from searches between calls to
 * uds_update_sparse_cache() in two different ways. First, when a chapter falls off the end of the
 * volume, its virtual chapter number will be less than the oldest virtual chapter number. Since
 * that chapter is no longer part of the volume, there's no point in continuing to search that
 * chapter index. Once invalidated, that virtual chapter will still be considered a member of the
 * cache, but it will no longer be searched for matching names.
 *
 * The second mechanism is a heuristic based on keeping track of the number of consecutive search
 * misses in a given chapter index. Once that count exceeds a threshold, the skip_search flag will
 * be set to true, causing the chapter to be skipped when searching the entire cache, but still
 * allowing it to be found when searching for a hook in that specific chapter. Finding a hook will
 * clear the skip_search flag, once again allowing the non-hook searches to use that cache entry.
 * Again, regardless of the state of the skip_search flag, the virtual chapter must still be
 * considered a member of the cache by uds_sparse_cache_contains().
 */

#define SKIP_SEARCH_THRESHOLD 20000
#define ZONE_ZERO 0

/*
 * These counters are essentially fields of the struct cached_chapter_index, but are segregated
 * into this structure because they are frequently modified. They are grouped and aligned to keep
 * them on different cache lines from the chapter fields that are accessed far more often than they
 * are updated.
 */
struct __aligned(L1_CACHE_BYTES) cached_index_counters {
	u64 consecutive_misses;
};

struct __aligned(L1_CACHE_BYTES) cached_chapter_index {
	/*
	 * The virtual chapter number of the cached chapter index. NO_CHAPTER means this cache
	 * entry is unused. This field must only be modified in the critical section in
	 * uds_update_sparse_cache().
	 */
	u64 virtual_chapter;

	u32 index_pages_count;

	/*
	 * These pointers are immutable during the life of the cache. The contents of the arrays
	 * change when the cache entry is replaced.
	 */
	struct delta_index_page *index_pages;
	struct dm_buffer **page_buffers;

	/*
	 * If set, skip the chapter when searching the entire cache. This flag is just a
	 * performance optimization. It is mutable between cache updates, but it rarely
	 * changes and is frequently accessed, so it is grouped with the immutable fields.
	 */
	bool skip_search;

	/*
	 * The cache-aligned counters change often and are placed at the end of the structure to
	 * prevent false sharing with the more stable fields above.
	 */
	struct cached_index_counters counters;
};

/*
 * A search_list represents an ordering of the sparse chapter index cache entry array, from most
 * recently accessed to least recently accessed, which is the order in which the indexes should be
 * searched and the reverse order in which they should be evicted from the cache.
 *
 * Cache entries that are dead or empty are kept at the end of the list, avoiding the need to even
 * iterate over them to search, and ensuring that dead entries are replaced before any live entries
 * are evicted.
 *
 * The search list is instantiated for each zone thread, avoiding any need for synchronization. The
 * structure is allocated on a cache boundary to avoid false sharing of memory cache lines between
 * zone threads.
 */
struct search_list {
	u8 capacity;
	u8 first_dead_entry;
	struct cached_chapter_index *entries[];
};

struct threads_barrier {
	/* Lock for this barrier object */
	struct semaphore lock;
	/* Semaphore for threads waiting at this barrier */
	struct semaphore wait;
	/* Number of threads which have arrived */
	int arrived;
	/* Total number of threads using this barrier */
	int thread_count;
};

struct sparse_cache {
	const struct index_geometry *geometry;
	unsigned int capacity;
	unsigned int zone_count;

	unsigned int skip_threshold;
	struct search_list *search_lists[MAX_ZONES];
	struct cached_chapter_index **scratch_entries;

	struct threads_barrier begin_update_barrier;
	struct threads_barrier end_update_barrier;

	struct cached_chapter_index chapters[];
};

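/* Prepare a barrier for use by the given number of threads. */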
static void initialize_threads_barrier(struct threads_barrier *barrier,
				       unsigned int thread_count)
{
	sema_init(&barrier->lock, 1);
	barrier->arrived = 0;
	barrier->thread_count = thread_count;
	sema_init(&barrier->wait, 0);
}

static inline void __down(struct semaphore *semaphore)
{
	/*
	 * Do not use down(semaphore). Instead use down_interruptible so that
	 * we do not get 120 second stall messages in kern.log.
	 */
	while (down_interruptible(semaphore) != 0) {
		/*
		 * If we're called from a user-mode process (e.g., "dmsetup
		 * remove") while waiting for an operation that may take a
		 * while (e.g., UDS index save), and a signal is sent (SIGINT,
		 * SIGUSR2), then down_interruptible will not block. If that
		 * happens, sleep briefly to avoid keeping the CPU locked up in
		 * this loop. We could just call cond_resched, but then we'd
		 * still keep consuming CPU time slices and swamp other threads
		 * trying to do computational work.
		 */
		fsleep(1000);
	}
}

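/*
 * Wait at the barrier until every participating thread has arrived. The last thread to arrive
 * wakes all of the waiters, resets the arrival count, and lets everyone proceed.
 */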
static void enter_threads_barrier(struct threads_barrier *barrier)
{
	__down(&barrier->lock);
	if (++barrier->arrived == barrier->thread_count) {
		/* last thread */
		int i;

		for (i = 1; i < barrier->thread_count; i++)
			up(&barrier->wait);

		barrier->arrived = 0;
		up(&barrier->lock);
	} else {
		up(&barrier->lock);
		__down(&barrier->wait);
	}
}

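/* Allocate the index page and buffer arrays for a cache entry and mark the entry unused. */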
static int __must_check initialize_cached_chapter_index(struct cached_chapter_index *chapter,
							const struct index_geometry *geometry)
{
	int result;

	chapter->virtual_chapter = NO_CHAPTER;
	chapter->index_pages_count = geometry->index_pages_per_chapter;

	result = vdo_allocate(chapter->index_pages_count, struct delta_index_page,
			      __func__, &chapter->index_pages);
	if (result != VDO_SUCCESS)
		return result;

	return vdo_allocate(chapter->index_pages_count, struct dm_buffer *,
			    "sparse index volume pages", &chapter->page_buffers);
}

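/*
 * Allocate one zone's search list, initially ordered to match the cache's chapter array. All
 * entries start out dead, so first_dead_entry begins at zero.
 */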
static int __must_check make_search_list(struct sparse_cache *cache,
					 struct search_list **list_ptr)
{
	struct search_list *list;
	unsigned int bytes;
	u8 i;
	int result;

	bytes = (sizeof(struct search_list) +
		 (cache->capacity * sizeof(struct cached_chapter_index *)));
	result = vdo_allocate_cache_aligned(bytes, "search list", &list);
	if (result != VDO_SUCCESS)
		return result;

	list->capacity = cache->capacity;
	list->first_dead_entry = 0;

	for (i = 0; i < list->capacity; i++)
		list->entries[i] = &cache->chapters[i];

	*list_ptr = list;
	return UDS_SUCCESS;
}

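/*
 * Allocate the cache, its chapter entries, the per-zone search lists, and the scratch space used
 * by purge_search_list(), and initialize the barriers that coordinate cache updates across zones.
 */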
int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int capacity,
			  unsigned int zone_count, struct sparse_cache **cache_ptr)
{
	int result;
	unsigned int i;
	struct sparse_cache *cache;
	unsigned int bytes;

	bytes = (sizeof(struct sparse_cache) + (capacity * sizeof(struct cached_chapter_index)));
	result = vdo_allocate_cache_aligned(bytes, "sparse cache", &cache);
	if (result != VDO_SUCCESS)
		return result;

	cache->geometry = geometry;
	cache->capacity = capacity;
	cache->zone_count = zone_count;

	/*
	 * Scale down the skip threshold since the cache only counts cache misses in zone zero, but
	 * requests are being handled in all zones.
	 */
	cache->skip_threshold = (SKIP_SEARCH_THRESHOLD / zone_count);

	initialize_threads_barrier(&cache->begin_update_barrier, zone_count);
	initialize_threads_barrier(&cache->end_update_barrier, zone_count);

	for (i = 0; i < capacity; i++) {
		result = initialize_cached_chapter_index(&cache->chapters[i], geometry);
		if (result != UDS_SUCCESS)
			goto out;
	}

	for (i = 0; i < zone_count; i++) {
		result = make_search_list(cache, &cache->search_lists[i]);
		if (result != UDS_SUCCESS)
			goto out;
	}

	/* purge_search_list() needs some temporary lists for sorting. */
	result = vdo_allocate(capacity * 2, struct cached_chapter_index *,
			      "scratch entries", &cache->scratch_entries);
	if (result != VDO_SUCCESS)
		goto out;

	*cache_ptr = cache;
	return UDS_SUCCESS;
out:
	uds_free_sparse_cache(cache);
	return result;
}

static inline void set_skip_search(struct cached_chapter_index *chapter,
				   bool skip_search)
{
	/* Check before setting to reduce cache line contention. */
	if (READ_ONCE(chapter->skip_search) != skip_search)
		WRITE_ONCE(chapter->skip_search, skip_search);
}

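/* A hit resets the consecutive miss count and re-enables searching of the chapter. */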
static void score_search_hit(struct cached_chapter_index *chapter)
{
	chapter->counters.consecutive_misses = 0;
	set_skip_search(chapter, false);
}

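/*
 * A miss increments the consecutive miss count, setting skip_search once the threshold is
 * exceeded.
 */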
static void score_search_miss(struct sparse_cache *cache,
			      struct cached_chapter_index *chapter)
{
	chapter->counters.consecutive_misses++;
	if (chapter->counters.consecutive_misses > cache->skip_threshold)
		set_skip_search(chapter, true);
}

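/* Mark a cache entry as unused and release any page buffers it holds. */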
static void release_cached_chapter_index(struct cached_chapter_index *chapter)
{
	unsigned int i;

	chapter->virtual_chapter = NO_CHAPTER;
	if (chapter->page_buffers == NULL)
		return;

	for (i = 0; i < chapter->index_pages_count; i++) {
		if (chapter->page_buffers[i] != NULL)
			dm_bufio_release(vdo_forget(chapter->page_buffers[i]));
	}
}

void uds_free_sparse_cache(struct sparse_cache *cache)
{
	unsigned int i;

	if (cache == NULL)
		return;

	vdo_free(cache->scratch_entries);

	for (i = 0; i < cache->zone_count; i++)
		vdo_free(cache->search_lists[i]);

	for (i = 0; i < cache->capacity; i++) {
		release_cached_chapter_index(&cache->chapters[i]);
		vdo_free(cache->chapters[i].index_pages);
		vdo_free(cache->chapters[i].page_buffers);
	}

	vdo_free(cache);
}

/*
 * Take the indicated element of the search list and move it to the start, shifting the entries
 * that preceded it down by one position.
 */
static inline void set_newest_entry(struct search_list *search_list, u8 index)
{
	struct cached_chapter_index *newest;

	if (index > 0) {
		newest = search_list->entries[index];
		memmove(&search_list->entries[1], &search_list->entries[0],
			index * sizeof(struct cached_chapter_index *));
		search_list->entries[0] = newest;
	}

	/*
	 * This function may have moved a dead chapter to the front of the list for reuse, in which
	 * case the set of dead chapters becomes smaller.
	 */
	if (search_list->first_dead_entry <= index)
		search_list->first_dead_entry++;
}

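/*
 * Check whether a virtual chapter is present in this zone's view of the cache. A hit refreshes
 * the zone's LRU order, and zone zero also updates its hit statistics.
 */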
bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter,
			       unsigned int zone_number)
{
	struct search_list *search_list;
	struct cached_chapter_index *chapter;
	u8 i;

	/*
	 * The correctness of the barriers depends on the invariant that between calls to
	 * uds_update_sparse_cache(), the answers this function returns must never vary: the result
	 * for a given chapter must be identical across zones. That invariant must be maintained
	 * even if the chapter falls off the end of the volume, or if searching it is disabled
	 * because of too many search misses.
	 */
	search_list = cache->search_lists[zone_number];
	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];

		if (virtual_chapter == chapter->virtual_chapter) {
			if (zone_number == ZONE_ZERO)
				score_search_hit(chapter);

			set_newest_entry(search_list, i);
			return true;
		}
	}

	return false;
}

/*
 * Re-sort cache entries into three sets (active, skippable, and dead) while maintaining the LRU
 * ordering that already existed. This operation must only be called during the critical section in
 * uds_update_sparse_cache().
 */
static void purge_search_list(struct search_list *search_list,
			      struct sparse_cache *cache, u64 oldest_virtual_chapter)
{
	struct cached_chapter_index **entries;
	struct cached_chapter_index **skipped;
	struct cached_chapter_index **dead;
	struct cached_chapter_index *chapter;
	unsigned int next_alive = 0;
	unsigned int next_skipped = 0;
	unsigned int next_dead = 0;
	unsigned int i;

	entries = &search_list->entries[0];
	skipped = &cache->scratch_entries[0];
	dead = &cache->scratch_entries[search_list->capacity];

	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];
		if ((chapter->virtual_chapter < oldest_virtual_chapter) ||
		    (chapter->virtual_chapter == NO_CHAPTER))
			dead[next_dead++] = chapter;
		else if (chapter->skip_search)
			skipped[next_skipped++] = chapter;
		else
			entries[next_alive++] = chapter;
	}

	memcpy(&entries[next_alive], skipped,
	       next_skipped * sizeof(struct cached_chapter_index *));
	memcpy(&entries[next_alive + next_skipped], dead,
	       next_dead * sizeof(struct cached_chapter_index *));
	search_list->first_dead_entry = next_alive + next_skipped;
}

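/* Replace the contents of a cache entry with the chapter index read from the volume. */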
static int __must_check cache_chapter_index(struct cached_chapter_index *chapter,
					    u64 virtual_chapter,
					    const struct volume *volume)
{
	int result;

	release_cached_chapter_index(chapter);

	result = uds_read_chapter_index_from_volume(volume, virtual_chapter,
						    chapter->page_buffers,
						    chapter->index_pages);
	if (result != UDS_SUCCESS)
		return result;

	chapter->counters.consecutive_misses = 0;
	chapter->virtual_chapter = virtual_chapter;
	chapter->skip_search = false;

	return UDS_SUCCESS;
}

static inline void copy_search_list(const struct search_list *source,
				    struct search_list *target)
{
	*target = *source;
	memcpy(target->entries, source->entries,
	       source->capacity * sizeof(struct cached_chapter_index *));
}

/*
 * Update the sparse cache to contain a chapter index. This function must be called by all the zone
 * threads with the same chapter number to correctly enter the thread barriers used to synchronize
 * the cache updates.
 */
int uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter)
{
	int result = UDS_SUCCESS;
	const struct uds_index *index = zone->index;
	struct sparse_cache *cache = index->volume->sparse_cache;

	if (uds_sparse_cache_contains(cache, virtual_chapter, zone->id))
		return UDS_SUCCESS;

	/*
	 * Wait for every zone thread to reach its corresponding barrier request and invoke this
	 * function before starting to modify the cache.
	 */
	enter_threads_barrier(&cache->begin_update_barrier);

	/*
	 * This is the start of the critical section: the zone zero thread is captain, effectively
	 * holding an exclusive lock on the sparse cache. All the other zone threads must do
	 * nothing between the two barriers. They will wait at the end_update_barrier again for the
	 * captain to finish the update.
	 */

	if (zone->id == ZONE_ZERO) {
		unsigned int z;
		struct search_list *list = cache->search_lists[ZONE_ZERO];

		purge_search_list(list, cache, zone->oldest_virtual_chapter);

		if (virtual_chapter >= index->oldest_virtual_chapter) {
			set_newest_entry(list, list->capacity - 1);
			result = cache_chapter_index(list->entries[0], virtual_chapter,
						     index->volume);
		}

		for (z = 1; z < cache->zone_count; z++)
			copy_search_list(list, cache->search_lists[z]);
	}

	/*
	 * This is the end of the critical section. All cache invariants must have been restored.
	 */
	enter_threads_barrier(&cache->end_update_barrier);
	return result;
}

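/* Mark every cache entry as unused and release its page buffers. */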
void uds_invalidate_sparse_cache(struct sparse_cache *cache)
{
	unsigned int i;

	for (i = 0; i < cache->capacity; i++)
		release_cached_chapter_index(&cache->chapters[i]);
}

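/*
 * Decide whether to skip a chapter: dead or expired entries are always skipped; when a specific
 * chapter is requested, every other chapter is skipped; otherwise the skip_search flag decides.
 */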
static inline bool should_skip_chapter(struct cached_chapter_index *chapter,
				       u64 oldest_chapter, u64 requested_chapter)
{
	if ((chapter->virtual_chapter == NO_CHAPTER) ||
	    (chapter->virtual_chapter < oldest_chapter))
		return true;

	if (requested_chapter != NO_CHAPTER)
		return requested_chapter != chapter->virtual_chapter;
	else
		return READ_ONCE(chapter->skip_search);
}

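/*
 * Search a single cached chapter index for a record name by locating the one index page that
 * could contain the name and searching only that page.
 */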
static int __must_check search_cached_chapter_index(struct cached_chapter_index *chapter,
						    const struct index_geometry *geometry,
						    const struct index_page_map *index_page_map,
						    const struct uds_record_name *name,
						    u16 *record_page_ptr)
{
	u32 physical_chapter =
		uds_map_to_physical_chapter(geometry, chapter->virtual_chapter);
	u32 index_page_number =
		uds_find_index_page_number(index_page_map, name, physical_chapter);
	struct delta_index_page *index_page =
		&chapter->index_pages[index_page_number];

	return uds_search_chapter_index_page(index_page, geometry, name,
					     record_page_ptr);
}

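/*
 * Search the cached chapter indexes for a record name. If *virtual_chapter_ptr names a specific
 * chapter, only that chapter is searched; otherwise every live, non-skipped entry is searched in
 * LRU order. On a match, *virtual_chapter_ptr and *record_page_ptr indicate where the record may
 * be found.
 */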
int uds_search_sparse_cache(struct index_zone *zone, const struct uds_record_name *name,
			    u64 *virtual_chapter_ptr, u16 *record_page_ptr)
{
	int result;
	struct volume *volume = zone->index->volume;
	struct sparse_cache *cache = volume->sparse_cache;
	struct cached_chapter_index *chapter;
	struct search_list *search_list;
	u8 i;
	/* Search the entire cache unless a specific chapter was requested. */
	bool search_one = (*virtual_chapter_ptr != NO_CHAPTER);

	*record_page_ptr = NO_CHAPTER_INDEX_ENTRY;
	search_list = cache->search_lists[zone->id];
	for (i = 0; i < search_list->first_dead_entry; i++) {
		chapter = search_list->entries[i];

		if (should_skip_chapter(chapter, zone->oldest_virtual_chapter,
					*virtual_chapter_ptr))
			continue;

		result = search_cached_chapter_index(chapter, cache->geometry,
						     volume->index_page_map, name,
						     record_page_ptr);
		if (result != UDS_SUCCESS)
			return result;

		if (*record_page_ptr != NO_CHAPTER_INDEX_ENTRY) {
			/*
			 * In theory, this might be a false match while a true match exists in
			 * another chapter, but that's a very rare case and not worth the extra
			 * search complexity.
			 */
			set_newest_entry(search_list, i);
			if (zone->id == ZONE_ZERO)
				score_search_hit(chapter);

			*virtual_chapter_ptr = chapter->virtual_chapter;
			return UDS_SUCCESS;
		}

		if (zone->id == ZONE_ZERO)
			score_search_miss(cache, chapter);

		if (search_one)
			break;
	}

	return UDS_SUCCESS;
}