// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include <linux/dm-bufio.h>

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/jiffies.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/stacktrace.h>
#include <linux/jump_label.h>

#include "dm.h"

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when the number of dirty buffers
 *	exceeds DM_BUFIO_WRITEBACK_RATIO times the number of clean buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_RATIO	3
#define DM_BUFIO_LOW_WATERMARK_RATIO	16
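
/*
 * Worked example of the sizing policy above (illustrative numbers, not
 * taken from this file): with 16 GiB of main memory and a practically
 * unlimited vmalloc area, the default cache size is
 * min(2% of 16 GiB, 25% of vmalloc space) = ~327 MiB; on a 32-bit
 * machine with a 128 MiB vmalloc area the vmalloc term wins and the
 * default drops to 25% of 128 MiB = 32 MiB.
 */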

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	30

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	300

/*
 * The number of bytes of cached data to keep around.
 */
#define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)

/*
 * Align buffer writes to this boundary.
 * Tests show that SSDs have the highest IOPS when using 4k writes.
 */
#define DM_BUFIO_WRITE_ALIGN		4096
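
/*
 * Illustrative example of the alignment (not part of the driver): a dirty
 * byte range of [100, 7000) in an 8192-byte buffer is widened for the
 * write to [0, 8192).  The start is rounded down with
 * "offset &= -DM_BUFIO_WRITE_ALIGN" and the end is rounded up with
 * "end += DM_BUFIO_WRITE_ALIGN - 1; end &= -DM_BUFIO_WRITE_ALIGN",
 * exactly as submit_io() does below.
 */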

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*--------------------------------------------------------------*/

/*
 * Rather than use an LRU list, we use a clock algorithm where entries
 * are held in a circular list.  When an entry is 'hit' a reference bit
 * is set.  The least recently used entry is approximated by running a
 * cursor around the list selecting unreferenced entries. Referenced
 * entries have their reference bit cleared as the cursor passes them.
 */
struct lru_entry {
	struct list_head list;
	atomic_t referenced;
};

struct lru_iter {
	struct lru *lru;
	struct list_head list;
	struct lru_entry *stop;
	struct lru_entry *e;
};

struct lru {
	struct list_head *cursor;
	unsigned long count;

	struct list_head iterators;
};

/*--------------*/

static void lru_init(struct lru *lru)
{
	lru->cursor = NULL;
	lru->count = 0;
	INIT_LIST_HEAD(&lru->iterators);
}

static void lru_destroy(struct lru *lru)
{
	WARN_ON_ONCE(lru->cursor);
	WARN_ON_ONCE(!list_empty(&lru->iterators));
}

/*
 * Insert a new entry into the lru.
 */
static void lru_insert(struct lru *lru, struct lru_entry *le)
{
	/*
	 * Don't be tempted to set this to 1; it makes the lru
	 * aspect perform poorly.
	 */
	atomic_set(&le->referenced, 0);

	if (lru->cursor) {
		list_add_tail(&le->list, lru->cursor);
	} else {
		INIT_LIST_HEAD(&le->list);
		lru->cursor = &le->list;
	}
	lru->count++;
}

/*--------------*/

/*
 * Convert a list_head pointer to an lru_entry pointer.
 */
static inline struct lru_entry *to_le(struct list_head *l)
{
	return container_of(l, struct lru_entry, list);
}

/*
 * Initialize an lru_iter and add it to the list of cursors in the lru.
 */
static void lru_iter_begin(struct lru *lru, struct lru_iter *it)
{
	it->lru = lru;
	it->stop = lru->cursor ? to_le(lru->cursor->prev) : NULL;
	it->e = lru->cursor ? to_le(lru->cursor) : NULL;
	list_add(&it->list, &lru->iterators);
}

/*
 * Remove an lru_iter from the list of cursors in the lru.
 */
static inline void lru_iter_end(struct lru_iter *it)
{
	list_del(&it->list);
}

/* Predicate function type to be used with lru_iter_next */
typedef bool (*iter_predicate)(struct lru_entry *le, void *context);

/*
 * Advance the cursor to the next entry that passes the
 * predicate, and return that entry.  Returns NULL if the
 * iteration is complete.
 */
static struct lru_entry *lru_iter_next(struct lru_iter *it,
				       iter_predicate pred, void *context)
{
	struct lru_entry *e;

	while (it->e) {
		e = it->e;

		/* advance the cursor */
		if (it->e == it->stop)
			it->e = NULL;
		else
			it->e = to_le(it->e->list.next);

		if (pred(e, context))
			return e;
	}

	return NULL;
}
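
/*
 * Typical (illustrative) use of the iterator above; 'interesting' and
 * 'process' are hypothetical caller-supplied helpers:
 *
 *	struct lru_iter it;
 *	struct lru_entry *le;
 *
 *	lru_iter_begin(lru, &it);
 *	while ((le = lru_iter_next(&it, interesting, context)))
 *		process(le);
 *	lru_iter_end(&it);
 */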

/*
 * Invalidate a specific lru_entry and update all cursors in
 * the lru accordingly.
 */
static void lru_iter_invalidate(struct lru *lru, struct lru_entry *e)
{
	struct lru_iter *it;

	list_for_each_entry(it, &lru->iterators, list) {
		/* Move it->e forwards if necessary. */
		if (it->e == e) {
			it->e = to_le(it->e->list.next);
			if (it->e == e)
				it->e = NULL;
		}

		/* Move it->stop backwards if necessary. */
		if (it->stop == e) {
			it->stop = to_le(it->stop->list.prev);
			if (it->stop == e)
				it->stop = NULL;
		}
	}
}

/*--------------*/

/*
 * Remove a specific entry from the lru.
 */
static void lru_remove(struct lru *lru, struct lru_entry *le)
{
	lru_iter_invalidate(lru, le);
	if (lru->count == 1) {
		lru->cursor = NULL;
	} else {
		if (lru->cursor == &le->list)
			lru->cursor = lru->cursor->next;
		list_del(&le->list);
	}
	lru->count--;
}

/*
 * Mark as referenced.
 */
static inline void lru_reference(struct lru_entry *le)
{
	atomic_set(&le->referenced, 1);
}

/*--------------*/

/*
 * Remove the (approximately) least recently used entry that passes the
 * predicate.  Returns NULL on failure.
 */
enum evict_result {
	ER_EVICT,
	ER_DONT_EVICT,
	ER_STOP, /* stop looking for something to evict */
};

typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context);

static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context, bool no_sleep)
{
	unsigned long tested = 0;
	struct list_head *h = lru->cursor;
	struct lru_entry *le;

	if (!h)
		return NULL;
	/*
	 * In the worst case we have to loop around twice. Once to clear
	 * the reference flags, and then again to discover the predicate
	 * fails for all entries.
	 */
	while (tested < lru->count) {
		le = container_of(h, struct lru_entry, list);

		if (atomic_read(&le->referenced)) {
			atomic_set(&le->referenced, 0);
		} else {
			tested++;
			switch (pred(le, context)) {
			case ER_EVICT:
				/*
				 * Adjust the cursor, so we start the next
				 * search from here.
				 */
				lru->cursor = le->list.next;
				lru_remove(lru, le);
				return le;

			case ER_DONT_EVICT:
				break;

			case ER_STOP:
				lru->cursor = le->list.next;
				return NULL;
			}
		}

		h = h->next;

		if (!no_sleep)
			cond_resched();
	}

	return NULL;
}
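
/*
 * Sketch of how the clock machinery above fits together (illustrative
 * only; 'can_evict' is a hypothetical le_predicate):
 *
 *	lru_insert(lru, le);	 - add an entry with its reference bit clear
 *	lru_reference(le);	 - on a hit, set the reference bit
 *	le = lru_evict(lru, can_evict, ctx, false);
 *				 - sweep the cursor: referenced entries get
 *				   a second chance, and the first
 *				   unreferenced entry accepted by the
 *				   predicate is removed and returned
 */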

/*--------------------------------------------------------------*/

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	/* protected by the locks in dm_buffer_cache */
	struct rb_node node;

	/* immutable, so these don't need protecting */
	sector_t block;
	void *data;
	unsigned char data_mode;		/* DATA_MODE_* */

	/*
	 * These two fields are used in isolation, so do not need
	 * a surrounding lock.
	 */
	atomic_t hold_count;
	unsigned long last_accessed;

	/*
	 * Everything else is protected by the mutex in
	 * dm_bufio_client
	 */
	unsigned long state;
	struct lru_entry lru;
	unsigned char list_mode;		/* LIST_* */
	blk_status_t read_error;
	blk_status_t write_error;
	unsigned int dirty_start;
	unsigned int dirty_end;
	unsigned int write_start;
	unsigned int write_end;
	struct list_head write_list;
	struct dm_bufio_client *c;
	void (*end_io)(struct dm_buffer *b, blk_status_t bs);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
#define MAX_STACK 10
	unsigned int stack_len;
	unsigned long stack_entries[MAX_STACK];
#endif
};

/*--------------------------------------------------------------*/

/*
 * The buffer cache manages buffers, particularly:
 *  - incrementing/decrementing the holder count
 *  - setting the last_accessed field
 *  - maintaining clean/dirty state along with the lru
 *  - selecting buffers that match predicates
 *
 * It does *not* handle:
 *  - allocation/freeing of buffers
 *  - IO
 *  - eviction or cache sizing
 *
 * cache_get() and cache_put() are threadsafe; you do not need to
 * protect these calls with a surrounding mutex.  All the other
 * methods are not threadsafe; they do use locking primitives, but
 * only enough to ensure get/put are threadsafe.
 */

struct buffer_tree {
	union {
		struct rw_semaphore lock;
		rwlock_t spinlock;
	} u;
	struct rb_root root;
} ____cacheline_aligned_in_smp;

struct dm_buffer_cache {
	struct lru lru[LIST_SIZE];
	/*
	 * We spread entries across multiple trees to reduce contention
	 * on the locks.
	 */
	unsigned int num_locks;
	bool no_sleep;
	struct buffer_tree trees[];
};

static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);

static inline unsigned int cache_index(sector_t block, unsigned int num_locks)
{
	return dm_hash_locks_index(block, num_locks);
}

static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		read_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		down_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		read_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		up_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		write_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		down_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block)
{
	if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
		write_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
	else
		up_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
}

/*
 * Sometimes we want to repeatedly get and drop locks as part of an iteration.
 * This struct helps avoid redundant drops and gets of the same lock.
 */
struct lock_history {
	struct dm_buffer_cache *cache;
	bool write;
	unsigned int previous;
	unsigned int no_previous;
};

static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool write)
{
	lh->cache = cache;
	lh->write = write;
	lh->no_previous = cache->num_locks;
	lh->previous = lh->no_previous;
}

static void __lh_lock(struct lock_history *lh, unsigned int index)
{
	if (lh->write) {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			write_lock_bh(&lh->cache->trees[index].u.spinlock);
		else
			down_write(&lh->cache->trees[index].u.lock);
	} else {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			read_lock_bh(&lh->cache->trees[index].u.spinlock);
		else
			down_read(&lh->cache->trees[index].u.lock);
	}
}

static void __lh_unlock(struct lock_history *lh, unsigned int index)
{
	if (lh->write) {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			write_unlock_bh(&lh->cache->trees[index].u.spinlock);
		else
			up_write(&lh->cache->trees[index].u.lock);
	} else {
		if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
			read_unlock_bh(&lh->cache->trees[index].u.spinlock);
		else
			up_read(&lh->cache->trees[index].u.lock);
	}
}

/*
 * Make sure you call this, since it unlocks the final lock.
 */
static void lh_exit(struct lock_history *lh)
{
	if (lh->previous != lh->no_previous) {
		__lh_unlock(lh, lh->previous);
		lh->previous = lh->no_previous;
	}
}

/*
 * Named 'next' because there is no corresponding
 * 'up/unlock' call since it's done automatically.
 */
static void lh_next(struct lock_history *lh, sector_t b)
{
	unsigned int index = cache_index(b, lh->no_previous); /* no_previous is num_locks */

	if (lh->previous != lh->no_previous) {
		if (lh->previous != index) {
			__lh_unlock(lh, lh->previous);
			__lh_lock(lh, index);
			lh->previous = index;
		}
	} else {
		__lh_lock(lh, index);
		lh->previous = index;
	}
}
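
/*
 * Illustrative lock_history pattern (this mirrors how __evict_pred() and
 * __cache_iterate() below use it; 'blocks' and 'n' are hypothetical):
 *
 *	struct lock_history lh;
 *	unsigned int i;
 *
 *	lh_init(&lh, bc, false);
 *	for (i = 0; i < n; i++)
 *		lh_next(&lh, blocks[i]);	- the previous tree lock is
 *						  only dropped and a new one
 *						  taken when the tree index
 *						  changes
 *	lh_exit(&lh);				- releases the final lock
 */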

static inline struct dm_buffer *le_to_buffer(struct lru_entry *le)
{
	return container_of(le, struct dm_buffer, lru);
}

static struct dm_buffer *list_to_buffer(struct list_head *l)
{
	struct lru_entry *le = list_entry(l, struct lru_entry, list);

	return le_to_buffer(le);
}

static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks, bool no_sleep)
{
	unsigned int i;

	bc->num_locks = num_locks;
	bc->no_sleep = no_sleep;

	for (i = 0; i < bc->num_locks; i++) {
		if (no_sleep)
			rwlock_init(&bc->trees[i].u.spinlock);
		else
			init_rwsem(&bc->trees[i].u.lock);
		bc->trees[i].root = RB_ROOT;
	}

	lru_init(&bc->lru[LIST_CLEAN]);
	lru_init(&bc->lru[LIST_DIRTY]);
}

static void cache_destroy(struct dm_buffer_cache *bc)
{
	unsigned int i;

	for (i = 0; i < bc->num_locks; i++)
		WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc->trees[i].root));

	lru_destroy(&bc->lru[LIST_CLEAN]);
	lru_destroy(&bc->lru[LIST_DIRTY]);
}

/*--------------*/

/*
 * Not threadsafe, or racy depending on how you look at it.
 */
static inline unsigned long cache_count(struct dm_buffer_cache *bc, int list_mode)
{
	return bc->lru[list_mode].count;
}

static inline unsigned long cache_total(struct dm_buffer_cache *bc)
{
	return cache_count(bc, LIST_CLEAN) + cache_count(bc, LIST_DIRTY);
}

/*--------------*/

/*
 * Gets a specific buffer, indexed by block.
 * If the buffer is found then its holder count will be incremented and
 * lru_reference will be called.
 *
 * threadsafe
 */
static struct dm_buffer *__cache_get(const struct rb_root *root, sector_t block)
{
	struct rb_node *n = root->rb_node;
	struct dm_buffer *b;

	while (n) {
		b = container_of(n, struct dm_buffer, node);

		if (b->block == block)
			return b;

		n = block < b->block ? n->rb_left : n->rb_right;
	}

	return NULL;
}

static void __cache_inc_buffer(struct dm_buffer *b)
{
	atomic_inc(&b->hold_count);
	WRITE_ONCE(b->last_accessed, jiffies);
}

static struct dm_buffer *cache_get(struct dm_buffer_cache *bc, sector_t block)
{
	struct dm_buffer *b;

	cache_read_lock(bc, block);
	b = __cache_get(&bc->trees[cache_index(block, bc->num_locks)].root, block);
	if (b) {
		lru_reference(&b->lru);
		__cache_inc_buffer(b);
	}
	cache_read_unlock(bc, block);

	return b;
}

/*--------------*/

/*
 * Returns true if the hold count hits zero.
 * threadsafe
 */
static bool cache_put(struct dm_buffer_cache *bc, struct dm_buffer *b)
{
	bool r;

	cache_read_lock(bc, b->block);
	BUG_ON(!atomic_read(&b->hold_count));
	r = atomic_dec_and_test(&b->hold_count);
	cache_read_unlock(bc, b->block);

	return r;
}
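
/*
 * Illustrative get/put pairing (these two calls are the only ones that
 * are safe without a surrounding mutex; 'use_data' is a hypothetical
 * consumer):
 *
 *	struct dm_buffer *b = cache_get(bc, block);
 *
 *	if (b) {
 *		use_data(b->data);
 *		cache_put(bc, b);	- returns true when this drop
 *					  takes the hold count to zero
 *	}
 */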

/*--------------*/

typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *);

/*
 * Evicts a buffer based on a predicate.  The oldest buffer that
 * matches the predicate will be selected.  In addition to matching
 * the predicate, the selected buffer must have a hold_count of zero.
 */
struct evict_wrapper {
	struct lock_history *lh;
	b_predicate pred;
	void *context;
};

/*
 * Wraps the buffer predicate, turning it into an lru predicate, and
 * adds an extra test for hold_count.
 */
static enum evict_result __evict_pred(struct lru_entry *le, void *context)
{
	struct evict_wrapper *w = context;
	struct dm_buffer *b = le_to_buffer(le);

	lh_next(w->lh, b->block);

	if (atomic_read(&b->hold_count))
		return ER_DONT_EVICT;

	return w->pred(b, w->context);
}

static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode,
				       b_predicate pred, void *context,
				       struct lock_history *lh)
{
	struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
	struct lru_entry *le;
	struct dm_buffer *b;

	le = lru_evict(&bc->lru[list_mode], __evict_pred, &w, bc->no_sleep);
	if (!le)
		return NULL;

	b = le_to_buffer(le);
	/* __evict_pred will have locked the appropriate tree. */
	rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);

	return b;
}

static struct dm_buffer *cache_evict(struct dm_buffer_cache *bc, int list_mode,
				     b_predicate pred, void *context)
{
	struct dm_buffer *b;
	struct lock_history lh;

	lh_init(&lh, bc, true);
	b = __cache_evict(bc, list_mode, pred, context, &lh);
	lh_exit(&lh);

	return b;
}

/*--------------*/

/*
 * Mark a buffer as clean or dirty. Not threadsafe.
 */
static void cache_mark(struct dm_buffer_cache *bc, struct dm_buffer *b, int list_mode)
{
	cache_write_lock(bc, b->block);
	if (list_mode != b->list_mode) {
		lru_remove(&bc->lru[b->list_mode], &b->lru);
		b->list_mode = list_mode;
		lru_insert(&bc->lru[b->list_mode], &b->lru);
	}
	cache_write_unlock(bc, b->block);
}

/*--------------*/

/*
 * Runs through the lru associated with 'old_mode', moving entries that
 * match the predicate to 'new_mode'.  Not threadsafe.
 */
static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
			      b_predicate pred, void *context, struct lock_history *lh)
{
	struct lru_entry *le;
	struct dm_buffer *b;
	struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};

	while (true) {
		le = lru_evict(&bc->lru[old_mode], __evict_pred, &w, bc->no_sleep);
		if (!le)
			break;

		b = le_to_buffer(le);
		b->list_mode = new_mode;
		lru_insert(&bc->lru[b->list_mode], &b->lru);
	}
}

static void cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_mode,
			    b_predicate pred, void *context)
{
	struct lock_history lh;

	lh_init(&lh, bc, true);
	__cache_mark_many(bc, old_mode, new_mode, pred, context, &lh);
	lh_exit(&lh);
}

/*--------------*/

/*
 * Iterates through all clean or dirty entries calling a function for each
 * entry.  The callback may terminate the iteration early.  Not threadsafe.
 */

/*
 * Iterator functions should return one of these actions to indicate
 * how the iteration should proceed.
 */
enum it_action {
	IT_NEXT,
	IT_COMPLETE,
};

typedef enum it_action (*iter_fn)(struct dm_buffer *b, void *context);

static void __cache_iterate(struct dm_buffer_cache *bc, int list_mode,
			    iter_fn fn, void *context, struct lock_history *lh)
{
	struct lru *lru = &bc->lru[list_mode];
	struct lru_entry *le, *first;

	if (!lru->cursor)
		return;

	first = le = to_le(lru->cursor);
	do {
		struct dm_buffer *b = le_to_buffer(le);

		lh_next(lh, b->block);

		switch (fn(b, context)) {
		case IT_NEXT:
			break;

		case IT_COMPLETE:
			return;
		}
		cond_resched();

		le = to_le(le->list.next);
	} while (le != first);
}

static void cache_iterate(struct dm_buffer_cache *bc, int list_mode,
			  iter_fn fn, void *context)
{
	struct lock_history lh;

	lh_init(&lh, bc, false);
	__cache_iterate(bc, list_mode, fn, context, &lh);
	lh_exit(&lh);
}
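
/*
 * Sketch of an iterator callback (write_one() further down is the real
 * in-tree user of this pattern; 'count_one' is hypothetical):
 *
 *	static enum it_action count_one(struct dm_buffer *b, void *context)
 *	{
 *		unsigned long *n = context;
 *
 *		(*n)++;
 *		return IT_NEXT;		- return IT_COMPLETE to stop early
 *	}
 *
 *	unsigned long n = 0;
 *
 *	cache_iterate(bc, LIST_DIRTY, count_one, &n);
 */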

/*--------------*/

/*
 * Passes ownership of the buffer to the cache. Returns false if the
 * buffer was already present (in which case ownership does not pass),
 * e.g. because of a race with another thread.
 *
 * Holder count should be 1 on insertion.
 *
 * Not threadsafe.
 */
static bool __cache_insert(struct rb_root *root, struct dm_buffer *b)
{
	struct rb_node **new = &root->rb_node, *parent = NULL;
	struct dm_buffer *found;

	while (*new) {
		found = container_of(*new, struct dm_buffer, node);

		if (found->block == b->block)
			return false;

		parent = *new;
		new = b->block < found->block ?
			&found->node.rb_left : &found->node.rb_right;
	}

	rb_link_node(&b->node, parent, new);
	rb_insert_color(&b->node, root);

	return true;
}

static bool cache_insert(struct dm_buffer_cache *bc, struct dm_buffer *b)
{
	bool r;

	if (WARN_ON_ONCE(b->list_mode >= LIST_SIZE))
		return false;

	cache_write_lock(bc, b->block);
	BUG_ON(atomic_read(&b->hold_count) != 1);
	r = __cache_insert(&bc->trees[cache_index(b->block, bc->num_locks)].root, b);
	if (r)
		lru_insert(&bc->lru[b->list_mode], &b->lru);
	cache_write_unlock(bc, b->block);

	return r;
}

/*--------------*/

/*
 * Removes a buffer from the cache; ownership of the buffer passes back to
 * the caller.  Fails unless the hold_count is one (i.e. the caller holds
 * the only reference).
 *
 * Not threadsafe.
 */
static bool cache_remove(struct dm_buffer_cache *bc, struct dm_buffer *b)
{
	bool r;

	cache_write_lock(bc, b->block);

	if (atomic_read(&b->hold_count) != 1) {
		r = false;
	} else {
		r = true;
		rb_erase(&b->node, &bc->trees[cache_index(b->block, bc->num_locks)].root);
		lru_remove(&bc->lru[b->list_mode], &b->lru);
	}

	cache_write_unlock(bc, b->block);

	return r;
}

/*--------------*/

typedef void (*b_release)(struct dm_buffer *);

static struct dm_buffer *__find_next(struct rb_root *root, sector_t block)
{
	struct rb_node *n = root->rb_node;
	struct dm_buffer *b;
	struct dm_buffer *best = NULL;

	while (n) {
		b = container_of(n, struct dm_buffer, node);

		if (b->block == block)
			return b;

		if (block <= b->block) {
			n = n->rb_left;
			best = b;
		} else {
			n = n->rb_right;
		}
	}

	return best;
}

static void __remove_range(struct dm_buffer_cache *bc,
			   struct rb_root *root,
			   sector_t begin, sector_t end,
			   b_predicate pred, b_release release)
{
	struct dm_buffer *b;

	while (true) {
		cond_resched();

		b = __find_next(root, begin);
		if (!b || (b->block >= end))
			break;

		begin = b->block + 1;

		if (atomic_read(&b->hold_count))
			continue;

		if (pred(b, NULL) == ER_EVICT) {
			rb_erase(&b->node, root);
			lru_remove(&bc->lru[b->list_mode], &b->lru);
			release(b);
		}
	}
}

static void cache_remove_range(struct dm_buffer_cache *bc,
			       sector_t begin, sector_t end,
			       b_predicate pred, b_release release)
{
	unsigned int i;

	BUG_ON(bc->no_sleep);
	for (i = 0; i < bc->num_locks; i++) {
		down_write(&bc->trees[i].u.lock);
		__remove_range(bc, &bc->trees[i].root, begin, end, pred, release);
		up_write(&bc->trees[i].u.lock);
	}
}

/*----------------------------------------------------------------*/

/*
 * Linking of buffers:
 *	All buffers are linked to buffer_cache with their node field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	dirty_lru too.  They are later added to lru in the process
 *	context.
 */
struct dm_bufio_client {
	struct block_device *bdev;
	unsigned int block_size;
	s8 sectors_per_block_bits;

	bool no_sleep;
	struct mutex lock;
	spinlock_t spinlock;

	int async_write_error;

	void (*alloc_callback)(struct dm_buffer *buf);
	void (*write_callback)(struct dm_buffer *buf);
	struct kmem_cache *slab_buffer;
	struct kmem_cache *slab_cache;
	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned int need_reserved_buffers;

	unsigned int minimum_buffers;

	sector_t start;

	struct shrinker *shrinker;
	struct work_struct shrink_work;
	atomic_long_t need_shrink;

	wait_queue_head_t free_buffer_wait;

	struct list_head client_list;

	/*
	 * Used by global_cleanup to sort the clients list.
	 */
	unsigned long oldest_buffer;

	struct dm_buffer_cache cache; /* must be last member */
};

/*----------------------------------------------------------------*/

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
		spin_lock_bh(&c->spinlock);
	else
		mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
		spin_unlock_bh(&c->spinlock);
	else
		mutex_unlock(&c->lock);
}

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time.  If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(global_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_cleanup_old_work;
static struct work_struct dm_bufio_replacement_work;


#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
static void buffer_record_stack(struct dm_buffer *b)
{
	b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
}
#endif

/*----------------------------------------------------------------*/

static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
{
	unsigned char data_mode;
	long diff;

	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	data_mode = b->data_mode;
	diff = (long)b->c->block_size;
	if (unlink)
		diff = -diff;

	spin_lock(&global_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	if (!unlink) {
		if (dm_bufio_current_allocated > dm_bufio_cache_size)
			queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
	}

	spin_unlock(&global_spinlock);
}

/*
 * Change the number of clients and recalculate per-client limit.
 */
static void __cache_size_refresh(void)
{
	if (WARN_ON(!mutex_is_locked(&dm_bufio_clients_lock)))
		return;
	if (WARN_ON(dm_bufio_client_count < 0))
		return;

	dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}
}

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 * always fails (i.e. order > MAX_PAGE_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       unsigned char *data_mode)
{
	if (unlikely(c->slab_cache != NULL)) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(c->slab_cache, gfp_mask);
	}

	if (c->block_size <= KMALLOC_MAX_SIZE &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
	}

	*data_mode = DATA_MODE_VMALLOC;

	return __vmalloc(c->block_size, gfp_mask);
}

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, unsigned char data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(c->slab_cache, data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data,
			   c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kmem_cache_free(c->slab_buffer, b);
		return NULL;
	}
	adjust_total_allocated(b, false);

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	b->stack_len = 0;
#endif
	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b, true);
	free_buffer_data(c, b->data, b->data_mode);
	kmem_cache_free(c->slab_buffer, b);
}

/*
 *--------------------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * Bio interface is faster but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory-consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use dm-io layer to do the I/O.
 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 * shortcomings.
 *--------------------------------------------------------------------------
 */

/*
 * dm-io completion routine. It just calls b->end_io, pretending
 * that the request was handled directly with the bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
}

static void use_dmio(struct dm_buffer *b, enum req_op op, sector_t sector,
		     unsigned int n_sectors, unsigned int offset,
		     unsigned short ioprio)
{
	int r;
	struct dm_io_request io_req = {
		.bi_opf = op,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = sector,
		.count = n_sectors,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = (char *)b->data + offset;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = (char *)b->data + offset;
	}

	r = dm_io(&io_req, 1, &region, NULL, ioprio);
	if (unlikely(r))
		b->end_io(b, errno_to_blk_status(r));
}

static void bio_complete(struct bio *bio)
{
	struct dm_buffer *b = bio->bi_private;
	blk_status_t status = bio->bi_status;

	bio_uninit(bio);
	kfree(bio);
	b->end_io(b, status);
}

static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
		    unsigned int n_sectors, unsigned int offset,
		    unsigned short ioprio)
{
	struct bio *bio;
	char *ptr;
	unsigned int len;

	bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
	if (!bio) {
		use_dmio(b, op, sector, n_sectors, offset, ioprio);
		return;
	}
	bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op);
	bio->bi_iter.bi_sector = sector;
	bio->bi_end_io = bio_complete;
	bio->bi_private = b;
	bio->bi_ioprio = ioprio;

	ptr = (char *)b->data + offset;
	len = n_sectors << SECTOR_SHIFT;

	__bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));

	submit_bio(bio);
}

static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
{
	sector_t sector;

	if (likely(c->sectors_per_block_bits >= 0))
		sector = block << c->sectors_per_block_bits;
	else
		sector = block * (c->block_size >> SECTOR_SHIFT);
	sector += c->start;

	return sector;
}
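
/*
 * Worked example (illustrative numbers): with a 4096-byte block size,
 * sectors_per_block_bits is 3, so block 10 maps to sector
 * (10 << 3) + c->start = 80 + c->start.  Block sizes that are not a
 * power of two have a negative sectors_per_block_bits and take the
 * multiply path instead.
 */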

static void submit_io(struct dm_buffer *b, enum req_op op, unsigned short ioprio,
		      void (*end_io)(struct dm_buffer *, blk_status_t))
{
	unsigned int n_sectors;
	sector_t sector;
	unsigned int offset, end;

	b->end_io = end_io;

	sector = block_to_sector(b->c, b->block);

	if (op != REQ_OP_WRITE) {
		n_sectors = b->c->block_size >> SECTOR_SHIFT;
		offset = 0;
	} else {
		if (b->c->write_callback)
			b->c->write_callback(b);
		offset = b->write_start;
		end = b->write_end;
		offset &= -DM_BUFIO_WRITE_ALIGN;
		end += DM_BUFIO_WRITE_ALIGN - 1;
		end &= -DM_BUFIO_WRITE_ALIGN;
		if (unlikely(end > b->c->block_size))
			end = b->c->block_size;

		sector += offset >> SECTOR_SHIFT;
		n_sectors = (end - offset) >> SECTOR_SHIFT;
	}

	if (b->data_mode != DATA_MODE_VMALLOC)
		use_bio(b, op, sector, n_sectors, offset, ioprio);
	else
		use_dmio(b, op, sector, n_sectors, offset, ioprio);
}

/*
 *--------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------
 */

/*
 * The endio routine for write.
 *
 * Set the error, clear the B_WRITING bit and wake anyone who was waiting
 * on it.
 */
static void write_endio(struct dm_buffer *b, blk_status_t status)
{
	b->write_error = status;
	if (unlikely(status)) {
		struct dm_bufio_client *c = b->c;

		(void)cmpxchg(&c->async_write_error, 0,
				blk_status_to_errno(status));
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_atomic();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_atomic();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is a previous write going on, wait for it to finish (we can't
 *   have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it. We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b,
				 struct list_head *write_list)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);

	b->write_start = b->dirty_start;
	b->write_end = b->dirty_end;

	if (!write_list)
		submit_io(b, REQ_OP_WRITE, IOPRIO_DEFAULT, write_endio);
	else
		list_add_tail(&b->write_list, write_list);
}

static void __flush_write_list(struct list_head *write_list)
{
	struct blk_plug plug;

	blk_start_plug(&plug);
	while (!list_empty(write_list)) {
		struct dm_buffer *b =
			list_entry(write_list->next, struct dm_buffer, write_list);
		list_del(&b->write_list);
		submit_io(b, REQ_OP_WRITE, IOPRIO_DEFAULT, write_endio);
		cond_resched();
	}
	blk_finish_plug(&plug);
}

/*
 * Wait until any activity on the buffer finishes.  Possibly write the
 * buffer if it is dirty.  When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(atomic_read(&b->hold_count));

	/* smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() */
	if (!smp_load_acquire(&b->state))	/* fast case */
		return;

	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b, NULL);
	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
}

static enum evict_result is_clean(struct dm_buffer *b, void *context)
{
	struct dm_bufio_client *c = context;

	/* These should never happen */
	if (WARN_ON_ONCE(test_bit(B_WRITING, &b->state)))
		return ER_DONT_EVICT;
	if (WARN_ON_ONCE(test_bit(B_DIRTY, &b->state)))
		return ER_DONT_EVICT;
	if (WARN_ON_ONCE(b->list_mode != LIST_CLEAN))
		return ER_DONT_EVICT;

	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep &&
	    unlikely(test_bit(B_READING, &b->state)))
		return ER_DONT_EVICT;

	return ER_EVICT;
}

static enum evict_result is_dirty(struct dm_buffer *b, void *context)
{
	/* These should never happen */
	if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
		return ER_DONT_EVICT;
	if (WARN_ON_ONCE(b->list_mode != LIST_DIRTY))
		return ER_DONT_EVICT;

	return ER_EVICT;
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	b = cache_evict(&c->cache, LIST_CLEAN, is_clean, c);
	if (b) {
		/* this also waits for pending reads */
		__make_buffer_clean(b);
		return b;
	}

	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
		return NULL;

	b = cache_evict(&c->cache, LIST_DIRTY, is_dirty, NULL);
	if (b) {
		__make_buffer_clean(b);
		return b;
	}

	return NULL;
}

/*
 * Wait until some other thread frees a buffer or releases its hold count
 * on one.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_current_state(TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	/*
	 * It's possible to miss a wake up event since we don't always
	 * hold c->lock when wake_up is called.  So we have a timeout here,
	 * just in case.
	 */
	io_schedule_timeout(5 * HZ);

	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

enum new_flag {
	NF_FRESH = 0,
	NF_READ = 1,
	NF_GET = 2,
	NF_PREFETCH = 3
};

/*
 * Allocate a new buffer. If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b;
	bool tried_noio_alloc = false;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOWAIT: don't wait; if we need to sleep we'll release our
	 *		    mutex and wait ourselves.
	 *	__GFP_NORETRY: don't retry and rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (nf == NF_PREFETCH)
			return NULL;

		if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
			dm_bufio_unlock(c);
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			dm_bufio_lock(c);
			if (b)
				return b;
			tried_noio_alloc = true;
		}

		if (!list_empty(&c->reserved_buffers)) {
			b = list_to_buffer(c->reserved_buffers.next);
			list_del(&b->lru.list);
			c->need_reserved_buffers++;

			return b;
		}

		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);

	if (!b)
		return NULL;

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	b->block = -1;
	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru.list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	/*
	 * We hold the bufio lock here, so no one can add entries to the
	 * wait queue anyway.
	 */
	if (unlikely(waitqueue_active(&c->free_buffer_wait)))
		wake_up(&c->free_buffer_wait);
}

static enum evict_result cleaned(struct dm_buffer *b, void *context)
{
	if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
		return ER_DONT_EVICT; /* should never happen */

	if (test_bit(B_DIRTY, &b->state) || test_bit(B_WRITING, &b->state))
		return ER_DONT_EVICT;
	else
		return ER_EVICT;
}

static void __move_clean_buffers(struct dm_bufio_client *c)
{
	cache_mark_many(&c->cache, LIST_DIRTY, LIST_CLEAN, cleaned, NULL);
}

struct write_context {
	int no_wait;
	struct list_head *write_list;
};

static enum it_action write_one(struct dm_buffer *b, void *context)
{
	struct write_context *wc = context;

	if (wc->no_wait && test_bit(B_WRITING, &b->state))
		return IT_COMPLETE;

	__write_dirty_buffer(b, wc->write_list);
	return IT_NEXT;
}

static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
					struct list_head *write_list)
{
	struct write_context wc = {.no_wait = no_wait, .write_list = write_list};

	__move_clean_buffers(c);
	cache_iterate(&c->cache, LIST_DIRTY, write_one, &wc);
}

/*
 * Check if we're over the watermark: if the number of dirty buffers
 * exceeds DM_BUFIO_WRITEBACK_RATIO times the number of clean buffers,
 * start writing them back asynchronously.
 */
static void __check_watermark(struct dm_bufio_client *c,
			      struct list_head *write_list)
{
	if (cache_count(&c->cache, LIST_DIRTY) >
	    cache_count(&c->cache, LIST_CLEAN) * DM_BUFIO_WRITEBACK_RATIO)
		__write_dirty_buffers_async(c, 1, write_list);
}
1736  
1737  /*
1738   *--------------------------------------------------------------
1739   * Getting a buffer
1740   *--------------------------------------------------------------
1741   */
1742  
cache_put_and_wake(struct dm_bufio_client * c,struct dm_buffer * b)1743  static void cache_put_and_wake(struct dm_bufio_client *c, struct dm_buffer *b)
1744  {
1745  	/*
1746  	 * Relying on waitqueue_active() is racey, but we sleep
1747  	 * with schedule_timeout anyway.
1748  	 */
1749  	if (cache_put(&c->cache, b) &&
1750  	    unlikely(waitqueue_active(&c->free_buffer_wait)))
1751  		wake_up(&c->free_buffer_wait);
1752  }
1753  
1754  /*
1755   * This assumes you have already checked the cache to see if the buffer
1756   * is already present (it will recheck after dropping the lock for allocation).
1757   */
__bufio_new(struct dm_bufio_client * c,sector_t block,enum new_flag nf,int * need_submit,struct list_head * write_list)1758  static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
1759  				     enum new_flag nf, int *need_submit,
1760  				     struct list_head *write_list)
1761  {
1762  	struct dm_buffer *b, *new_b = NULL;
1763  
1764  	*need_submit = 0;
1765  
1766  	/* This can't be called with NF_GET */
1767  	if (WARN_ON_ONCE(nf == NF_GET))
1768  		return NULL;
1769  
1770  	new_b = __alloc_buffer_wait(c, nf);
1771  	if (!new_b)
1772  		return NULL;
1773  
1774  	/*
1775  	 * We've had a period where the mutex was unlocked, so need to
1776  	 * recheck the buffer tree.
1777  	 */
1778  	b = cache_get(&c->cache, block);
1779  	if (b) {
1780  		__free_buffer_wake(new_b);
1781  		goto found_buffer;
1782  	}
1783  
1784  	__check_watermark(c, write_list);
1785  
1786  	b = new_b;
1787  	atomic_set(&b->hold_count, 1);
1788  	WRITE_ONCE(b->last_accessed, jiffies);
1789  	b->block = block;
1790  	b->read_error = 0;
1791  	b->write_error = 0;
1792  	b->list_mode = LIST_CLEAN;
1793  
1794  	if (nf == NF_FRESH)
1795  		b->state = 0;
1796  	else {
1797  		b->state = 1 << B_READING;
1798  		*need_submit = 1;
1799  	}
1800  
1801  	/*
1802  	 * We mustn't insert into the cache until the B_READING state
1803  	 * is set.  Otherwise another thread could get it and use
1804  	 * it before it had been read.
1805  	 */
1806  	cache_insert(&c->cache, b);
1807  
1808  	return b;
1809  
1810  found_buffer:
1811  	if (nf == NF_PREFETCH) {
1812  		cache_put_and_wake(c, b);
1813  		return NULL;
1814  	}
1815  
1816  	/*
1817  	 * Note: it is essential that we don't wait for the buffer to be
1818  	 * read if the dm_bufio_get function is used. Both dm_bufio_get and
1819  	 * dm_bufio_prefetch can be used in the driver request routine.
1820  	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1821  	 * the same buffer, it would deadlock if we waited.
1822  	 */
1823  	if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
1824  		cache_put_and_wake(c, b);
1825  		return NULL;
1826  	}
1827  
1828  	return b;
1829  }
1830  
1831  /*
1832   * The endio routine for reading: set the error, clear the bit and wake up
1833   * anyone waiting on the buffer.
1834   */
1835  static void read_endio(struct dm_buffer *b, blk_status_t status)
1836  {
1837  	b->read_error = status;
1838  
1839  	BUG_ON(!test_bit(B_READING, &b->state));
1840  
1841  	smp_mb__before_atomic();
1842  	clear_bit(B_READING, &b->state);
1843  	smp_mb__after_atomic();
1844  
1845  	wake_up_bit(&b->state, B_READING);
1846  }
1847  
1848  /*
1849   * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
1850   * functions is similar except that dm_bufio_new doesn't read the
1851   * buffer from the disk (assuming that the caller overwrites all the data
1852   * and uses dm_bufio_mark_buffer_dirty to write new data back).
1853   */
1854  static void *new_read(struct dm_bufio_client *c, sector_t block,
1855  		      enum new_flag nf, struct dm_buffer **bp,
1856  		      unsigned short ioprio)
1857  {
1858  	int need_submit = 0;
1859  	struct dm_buffer *b;
1860  
1861  	LIST_HEAD(write_list);
1862  
1863  	*bp = NULL;
1864  
1865  	/*
1866  	 * Fast path, hopefully the block is already in the cache.  No need
1867  	 * to get the client lock for this.
1868  	 */
1869  	b = cache_get(&c->cache, block);
1870  	if (b) {
1871  		if (nf == NF_PREFETCH) {
1872  			cache_put_and_wake(c, b);
1873  			return NULL;
1874  		}
1875  
1876  		/*
1877  		 * Note: it is essential that we don't wait for the buffer to be
1878  		 * read if the dm_bufio_get function is used. Both dm_bufio_get and
1879  		 * dm_bufio_prefetch can be used in the driver request routine.
1880  		 * If the user called both dm_bufio_prefetch and dm_bufio_get on
1881  		 * the same buffer, it would deadlock if we waited.
1882  		 */
1883  		if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
1884  			cache_put_and_wake(c, b);
1885  			return NULL;
1886  		}
1887  	}
1888  
1889  	if (!b) {
1890  		if (nf == NF_GET)
1891  			return NULL;
1892  
1893  		dm_bufio_lock(c);
1894  		b = __bufio_new(c, block, nf, &need_submit, &write_list);
1895  		dm_bufio_unlock(c);
1896  	}
1897  
1898  #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1899  	if (b && (atomic_read(&b->hold_count) == 1))
1900  		buffer_record_stack(b);
1901  #endif
1902  
1903  	__flush_write_list(&write_list);
1904  
1905  	if (!b)
1906  		return NULL;
1907  
1908  	if (need_submit)
1909  		submit_io(b, REQ_OP_READ, ioprio, read_endio);
1910  
1911  	if (nf != NF_GET)	/* we already tested this condition above */
1912  		wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1913  
1914  	if (b->read_error) {
1915  		int error = blk_status_to_errno(b->read_error);
1916  
1917  		dm_bufio_release(b);
1918  
1919  		return ERR_PTR(error);
1920  	}
1921  
1922  	*bp = b;
1923  
1924  	return b->data;
1925  }
1926  
1927  void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1928  		   struct dm_buffer **bp)
1929  {
1930  	return new_read(c, block, NF_GET, bp, IOPRIO_DEFAULT);
1931  }
1932  EXPORT_SYMBOL_GPL(dm_bufio_get);
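
/*
 * Example (illustrative sketch; "client", "BLOCK" and the helper are
 * hypothetical): dm_bufio_get() never issues I/O.  It returns the data only
 * if the block is already cached and fully read, and NULL otherwise, so the
 * caller must be prepared to fall back to dm_bufio_read() from a context
 * that may block:
 *
 *	struct dm_buffer *buf;
 *	void *data = dm_bufio_get(client, BLOCK, &buf);
 *
 *	if (data && !IS_ERR(data)) {
 *		use_cached_block(data);		// hypothetical helper
 *		dm_bufio_release(buf);
 *	}
 */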
1933  
1934  static void *__dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1935  			struct dm_buffer **bp, unsigned short ioprio)
1936  {
1937  	if (WARN_ON_ONCE(dm_bufio_in_request()))
1938  		return ERR_PTR(-EINVAL);
1939  
1940  	return new_read(c, block, NF_READ, bp, ioprio);
1941  }
1942  
1943  void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1944  		    struct dm_buffer **bp)
1945  {
1946  	return __dm_bufio_read(c, block, bp, IOPRIO_DEFAULT);
1947  }
1948  EXPORT_SYMBOL_GPL(dm_bufio_read);
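
/*
 * Example (illustrative sketch; "client", "BLOCK" and the helper are
 * hypothetical): dm_bufio_read() sleeps until the block is in memory and
 * returns either a pointer to the data or an ERR_PTR() on I/O error; the
 * buffer stays pinned until dm_bufio_release() is called:
 *
 *	struct dm_buffer *buf;
 *	void *data = dm_bufio_read(client, BLOCK, &buf);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	parse_metadata(data);			// hypothetical helper
 *	dm_bufio_release(buf);
 */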
1949  
1950  void *dm_bufio_read_with_ioprio(struct dm_bufio_client *c, sector_t block,
1951  				struct dm_buffer **bp, unsigned short ioprio)
1952  {
1953  	return __dm_bufio_read(c, block, bp, ioprio);
1954  }
1955  EXPORT_SYMBOL_GPL(dm_bufio_read_with_ioprio);
1956  
1957  void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1958  		   struct dm_buffer **bp)
1959  {
1960  	if (WARN_ON_ONCE(dm_bufio_in_request()))
1961  		return ERR_PTR(-EINVAL);
1962  
1963  	return new_read(c, block, NF_FRESH, bp, IOPRIO_DEFAULT);
1964  }
1965  EXPORT_SYMBOL_GPL(dm_bufio_new);
1966  
1967  static void __dm_bufio_prefetch(struct dm_bufio_client *c,
1968  			sector_t block, unsigned int n_blocks,
1969  			unsigned short ioprio)
1970  {
1971  	struct blk_plug plug;
1972  
1973  	LIST_HEAD(write_list);
1974  
1975  	if (WARN_ON_ONCE(dm_bufio_in_request()))
1976  		return; /* should never happen */
1977  
1978  	blk_start_plug(&plug);
1979  
1980  	for (; n_blocks--; block++) {
1981  		int need_submit;
1982  		struct dm_buffer *b;
1983  
1984  		b = cache_get(&c->cache, block);
1985  		if (b) {
1986  			/* already in cache */
1987  			cache_put_and_wake(c, b);
1988  			continue;
1989  		}
1990  
1991  		dm_bufio_lock(c);
1992  		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1993  				&write_list);
1994  		if (unlikely(!list_empty(&write_list))) {
1995  			dm_bufio_unlock(c);
1996  			blk_finish_plug(&plug);
1997  			__flush_write_list(&write_list);
1998  			blk_start_plug(&plug);
1999  			dm_bufio_lock(c);
2000  		}
2001  		if (unlikely(b != NULL)) {
2002  			dm_bufio_unlock(c);
2003  
2004  			if (need_submit)
2005  				submit_io(b, REQ_OP_READ, ioprio, read_endio);
2006  			dm_bufio_release(b);
2007  
2008  			cond_resched();
2009  
2010  			if (!n_blocks)
2011  				goto flush_plug;
2012  			dm_bufio_lock(c);
2013  		}
2014  		dm_bufio_unlock(c);
2015  	}
2016  
2017  flush_plug:
2018  	blk_finish_plug(&plug);
2019  }
2020  
2021  void dm_bufio_prefetch(struct dm_bufio_client *c, sector_t block, unsigned int n_blocks)
2022  {
2023  	return __dm_bufio_prefetch(c, block, n_blocks, IOPRIO_DEFAULT);
2024  }
2025  EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
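
/*
 * Example (illustrative sketch; "client", "first_block" and the helper are
 * hypothetical): prefetch issues the reads without waiting for them, so a
 * subsequent dm_bufio_read() of the same blocks will usually find them
 * already cached:
 *
 *	unsigned int i;
 *	struct dm_buffer *buf;
 *	void *data;
 *
 *	dm_bufio_prefetch(client, first_block, 16);
 *	for (i = 0; i < 16; i++) {
 *		data = dm_bufio_read(client, first_block + i, &buf);
 *		if (IS_ERR(data))
 *			continue;
 *		process_block(data);		// hypothetical helper
 *		dm_bufio_release(buf);
 *	}
 */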
2026  
2027  void dm_bufio_prefetch_with_ioprio(struct dm_bufio_client *c, sector_t block,
2028  				unsigned int n_blocks, unsigned short ioprio)
2029  {
2030  	return __dm_bufio_prefetch(c, block, n_blocks, ioprio);
2031  }
2032  EXPORT_SYMBOL_GPL(dm_bufio_prefetch_with_ioprio);
2033  
2034  void dm_bufio_release(struct dm_buffer *b)
2035  {
2036  	struct dm_bufio_client *c = b->c;
2037  
2038  	/*
2039  	 * If there were errors on the buffer, and the buffer is not
2040  	 * to be written, free the buffer. There is no point in caching
2041  	 * an invalid buffer.
2042  	 */
2043  	if ((b->read_error || b->write_error) &&
2044  	    !test_bit_acquire(B_READING, &b->state) &&
2045  	    !test_bit(B_WRITING, &b->state) &&
2046  	    !test_bit(B_DIRTY, &b->state)) {
2047  		dm_bufio_lock(c);
2048  
2049  		/* cache remove can fail if there are other holders */
2050  		if (cache_remove(&c->cache, b)) {
2051  			__free_buffer_wake(b);
2052  			dm_bufio_unlock(c);
2053  			return;
2054  		}
2055  
2056  		dm_bufio_unlock(c);
2057  	}
2058  
2059  	cache_put_and_wake(c, b);
2060  }
2061  EXPORT_SYMBOL_GPL(dm_bufio_release);
2062  
2063  void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
2064  					unsigned int start, unsigned int end)
2065  {
2066  	struct dm_bufio_client *c = b->c;
2067  
2068  	BUG_ON(start >= end);
2069  	BUG_ON(end > b->c->block_size);
2070  
2071  	dm_bufio_lock(c);
2072  
2073  	BUG_ON(test_bit(B_READING, &b->state));
2074  
2075  	if (!test_and_set_bit(B_DIRTY, &b->state)) {
2076  		b->dirty_start = start;
2077  		b->dirty_end = end;
2078  		cache_mark(&c->cache, b, LIST_DIRTY);
2079  	} else {
2080  		if (start < b->dirty_start)
2081  			b->dirty_start = start;
2082  		if (end > b->dirty_end)
2083  			b->dirty_end = end;
2084  	}
2085  
2086  	dm_bufio_unlock(c);
2087  }
2088  EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
2089  
2090  void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
2091  {
2092  	dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
2093  }
2094  EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
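
/*
 * Example (illustrative sketch; "client" and "BLOCK" are hypothetical): a
 * typical write path allocates the block with dm_bufio_new() (which does not
 * read it from disk), overwrites it completely, marks it dirty and later
 * commits with dm_bufio_write_dirty_buffers():
 *
 *	struct dm_buffer *buf;
 *	void *data;
 *	int r;
 *
 *	data = dm_bufio_new(client, BLOCK, &buf);
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	memset(data, 0, dm_bufio_get_block_size(client));
 *	dm_bufio_mark_buffer_dirty(buf);
 *	dm_bufio_release(buf);
 *
 *	r = dm_bufio_write_dirty_buffers(client);	// waits and flushes
 */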
2095  
2096  void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
2097  {
2098  	LIST_HEAD(write_list);
2099  
2100  	if (WARN_ON_ONCE(dm_bufio_in_request()))
2101  		return; /* should never happen */
2102  
2103  	dm_bufio_lock(c);
2104  	__write_dirty_buffers_async(c, 0, &write_list);
2105  	dm_bufio_unlock(c);
2106  	__flush_write_list(&write_list);
2107  }
2108  EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
2109  
2110  /*
2111   * For performance, it is essential that the buffers are written asynchronously
2112   * and simultaneously (so that the block layer can merge the writes) and then
2113   * waited upon.
2114   *
2115   * Finally, we flush the hardware disk cache.
2116   */
2117  static bool is_writing(struct lru_entry *e, void *context)
2118  {
2119  	struct dm_buffer *b = le_to_buffer(e);
2120  
2121  	return test_bit(B_WRITING, &b->state);
2122  }
2123  
2124  int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
2125  {
2126  	int a, f;
2127  	unsigned long nr_buffers;
2128  	struct lru_entry *e;
2129  	struct lru_iter it;
2130  
2131  	LIST_HEAD(write_list);
2132  
2133  	dm_bufio_lock(c);
2134  	__write_dirty_buffers_async(c, 0, &write_list);
2135  	dm_bufio_unlock(c);
2136  	__flush_write_list(&write_list);
2137  	dm_bufio_lock(c);
2138  
2139  	nr_buffers = cache_count(&c->cache, LIST_DIRTY);
2140  	lru_iter_begin(&c->cache.lru[LIST_DIRTY], &it);
2141  	while ((e = lru_iter_next(&it, is_writing, c))) {
2142  		struct dm_buffer *b = le_to_buffer(e);
2143  		__cache_inc_buffer(b);
2144  
2145  		BUG_ON(test_bit(B_READING, &b->state));
2146  
2147  		if (nr_buffers) {
2148  			nr_buffers--;
2149  			dm_bufio_unlock(c);
2150  			wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
2151  			dm_bufio_lock(c);
2152  		} else {
2153  			wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
2154  		}
2155  
2156  		if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, &b->state))
2157  			cache_mark(&c->cache, b, LIST_CLEAN);
2158  
2159  		cache_put_and_wake(c, b);
2160  
2161  		cond_resched();
2162  	}
2163  	lru_iter_end(&it);
2164  
2165  	wake_up(&c->free_buffer_wait);
2166  	dm_bufio_unlock(c);
2167  
2168  	a = xchg(&c->async_write_error, 0);
2169  	f = dm_bufio_issue_flush(c);
2170  	if (a)
2171  		return a;
2172  
2173  	return f;
2174  }
2175  EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
2176  
2177  /*
2178   * Use dm-io to send an empty flush request (REQ_PREFLUSH) to flush the device.
2179   */
2180  int dm_bufio_issue_flush(struct dm_bufio_client *c)
2181  {
2182  	struct dm_io_request io_req = {
2183  		.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC,
2184  		.mem.type = DM_IO_KMEM,
2185  		.mem.ptr.addr = NULL,
2186  		.client = c->dm_io,
2187  	};
2188  	struct dm_io_region io_reg = {
2189  		.bdev = c->bdev,
2190  		.sector = 0,
2191  		.count = 0,
2192  	};
2193  
2194  	if (WARN_ON_ONCE(dm_bufio_in_request()))
2195  		return -EINVAL;
2196  
2197  	return dm_io(&io_req, 1, &io_reg, NULL, IOPRIO_DEFAULT);
2198  }
2199  EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
2200  
2201  /*
2202   * Use dm-io to send a discard request to the device.
2203   */
2204  int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
2205  {
2206  	struct dm_io_request io_req = {
2207  		.bi_opf = REQ_OP_DISCARD | REQ_SYNC,
2208  		.mem.type = DM_IO_KMEM,
2209  		.mem.ptr.addr = NULL,
2210  		.client = c->dm_io,
2211  	};
2212  	struct dm_io_region io_reg = {
2213  		.bdev = c->bdev,
2214  		.sector = block_to_sector(c, block),
2215  		.count = block_to_sector(c, count),
2216  	};
2217  
2218  	if (WARN_ON_ONCE(dm_bufio_in_request()))
2219  		return -EINVAL; /* discards are optional */
2220  
2221  	return dm_io(&io_req, 1, &io_reg, NULL, IOPRIO_DEFAULT);
2222  }
2223  EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
2224  
2225  static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
2226  {
2227  	struct dm_buffer *b;
2228  
2229  	b = cache_get(&c->cache, block);
2230  	if (b) {
2231  		if (likely(!smp_load_acquire(&b->state))) {
2232  			if (cache_remove(&c->cache, b))
2233  				__free_buffer_wake(b);
2234  			else
2235  				cache_put_and_wake(c, b);
2236  		} else {
2237  			cache_put_and_wake(c, b);
2238  		}
2239  	}
2240  
2241  	return b ? true : false;
2242  }
2243  
2244  /*
2245   * Free the given buffer.
2246   *
2247   * This is just a hint; if the buffer is in use or dirty, this function
2248   * does nothing.
2249   */
2250  void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
2251  {
2252  	dm_bufio_lock(c);
2253  	forget_buffer(c, block);
2254  	dm_bufio_unlock(c);
2255  }
2256  EXPORT_SYMBOL_GPL(dm_bufio_forget);
2257  
2258  static enum evict_result idle(struct dm_buffer *b, void *context)
2259  {
2260  	return b->state ? ER_DONT_EVICT : ER_EVICT;
2261  }
2262  
2263  void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
2264  {
2265  	dm_bufio_lock(c);
2266  	cache_remove_range(&c->cache, block, block + n_blocks, idle, __free_buffer_wake);
2267  	dm_bufio_unlock(c);
2268  }
2269  EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
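
/*
 * Example (illustrative sketch; "client", "first_block" and "nr_blocks" are
 * hypothetical): after discarding a range on the device there is no point
 * keeping the now-stale blocks cached, so they can be dropped right away:
 *
 *	r = dm_bufio_issue_discard(client, first_block, nr_blocks);
 *	dm_bufio_forget_buffers(client, first_block, nr_blocks);
 */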
2270  
2271  void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned int n)
2272  {
2273  	c->minimum_buffers = n;
2274  }
2275  EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
2276  
2277  unsigned int dm_bufio_get_block_size(struct dm_bufio_client *c)
2278  {
2279  	return c->block_size;
2280  }
2281  EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
2282  
2283  sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
2284  {
2285  	sector_t s = bdev_nr_sectors(c->bdev);
2286  
2287  	if (s >= c->start)
2288  		s -= c->start;
2289  	else
2290  		s = 0;
2291  	if (likely(c->sectors_per_block_bits >= 0))
2292  		s >>= c->sectors_per_block_bits;
2293  	else
2294  		sector_div(s, c->block_size >> SECTOR_SHIFT);
2295  	return s;
2296  }
2297  EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
2298  
2299  struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
2300  {
2301  	return c->dm_io;
2302  }
2303  EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
2304  
2305  sector_t dm_bufio_get_block_number(struct dm_buffer *b)
2306  {
2307  	return b->block;
2308  }
2309  EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
2310  
2311  void *dm_bufio_get_block_data(struct dm_buffer *b)
2312  {
2313  	return b->data;
2314  }
2315  EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
2316  
2317  void *dm_bufio_get_aux_data(struct dm_buffer *b)
2318  {
2319  	return b + 1;
2320  }
2321  EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
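
/*
 * Example (illustrative sketch; "struct my_aux" and "buf" are hypothetical):
 * if the client was created with aux_size == sizeof(struct my_aux), each
 * buffer carries that much private space immediately after struct dm_buffer:
 *
 *	struct my_aux {
 *		unsigned long last_validated;
 *	};
 *
 *	struct my_aux *aux = dm_bufio_get_aux_data(buf);
 *
 *	aux->last_validated = jiffies;
 */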
2322  
2323  struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
2324  {
2325  	return b->c;
2326  }
2327  EXPORT_SYMBOL_GPL(dm_bufio_get_client);
2328  
2329  static enum it_action warn_leak(struct dm_buffer *b, void *context)
2330  {
2331  	bool *warned = context;
2332  
2333  	WARN_ON(!(*warned));
2334  	*warned = true;
2335  	DMERR("leaked buffer %llx, hold count %u, list %d",
2336  	      (unsigned long long)b->block, atomic_read(&b->hold_count), b->list_mode);
2337  #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2338  	stack_trace_print(b->stack_entries, b->stack_len, 1);
2339  	/* mark unclaimed to avoid WARN_ON at end of drop_buffers() */
2340  	atomic_set(&b->hold_count, 0);
2341  #endif
2342  	return IT_NEXT;
2343  }
2344  
2345  static void drop_buffers(struct dm_bufio_client *c)
2346  {
2347  	int i;
2348  	struct dm_buffer *b;
2349  
2350  	if (WARN_ON(dm_bufio_in_request()))
2351  		return; /* should never happen */
2352  
2353  	/*
2354  	 * An optimization so that the buffers are not written one-by-one.
2355  	 */
2356  	dm_bufio_write_dirty_buffers_async(c);
2357  
2358  	dm_bufio_lock(c);
2359  
2360  	while ((b = __get_unclaimed_buffer(c)))
2361  		__free_buffer_wake(b);
2362  
2363  	for (i = 0; i < LIST_SIZE; i++) {
2364  		bool warned = false;
2365  
2366  		cache_iterate(&c->cache, i, warn_leak, &warned);
2367  	}
2368  
2369  #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2370  	while ((b = __get_unclaimed_buffer(c)))
2371  		__free_buffer_wake(b);
2372  #endif
2373  
2374  	for (i = 0; i < LIST_SIZE; i++)
2375  		WARN_ON(cache_count(&c->cache, i));
2376  
2377  	dm_bufio_unlock(c);
2378  }
2379  
2380  static unsigned long get_retain_buffers(struct dm_bufio_client *c)
2381  {
2382  	unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
2383  
2384  	if (likely(c->sectors_per_block_bits >= 0))
2385  		retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
2386  	else
2387  		retain_bytes /= c->block_size;
2388  
2389  	return retain_bytes;
2390  }
2391  
2392  static void __scan(struct dm_bufio_client *c)
2393  {
2394  	int l;
2395  	struct dm_buffer *b;
2396  	unsigned long freed = 0;
2397  	unsigned long retain_target = get_retain_buffers(c);
2398  	unsigned long count = cache_total(&c->cache);
2399  
2400  	for (l = 0; l < LIST_SIZE; l++) {
2401  		while (true) {
2402  			if (count - freed <= retain_target)
2403  				atomic_long_set(&c->need_shrink, 0);
2404  			if (!atomic_long_read(&c->need_shrink))
2405  				break;
2406  
2407  			b = cache_evict(&c->cache, l,
2408  					l == LIST_CLEAN ? is_clean : is_dirty, c);
2409  			if (!b)
2410  				break;
2411  
2412  			__make_buffer_clean(b);
2413  			__free_buffer_wake(b);
2414  
2415  			atomic_long_dec(&c->need_shrink);
2416  			freed++;
2417  			cond_resched();
2418  		}
2419  	}
2420  }
2421  
2422  static void shrink_work(struct work_struct *w)
2423  {
2424  	struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
2425  
2426  	dm_bufio_lock(c);
2427  	__scan(c);
2428  	dm_bufio_unlock(c);
2429  }
2430  
2431  static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
2432  {
2433  	struct dm_bufio_client *c;
2434  
2435  	c = shrink->private_data;
2436  	atomic_long_add(sc->nr_to_scan, &c->need_shrink);
2437  	queue_work(dm_bufio_wq, &c->shrink_work);
2438  
2439  	return sc->nr_to_scan;
2440  }
2441  
2442  static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
2443  {
2444  	struct dm_bufio_client *c = shrink->private_data;
2445  	unsigned long count = cache_total(&c->cache);
2446  	unsigned long retain_target = get_retain_buffers(c);
2447  	unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
2448  
2449  	if (unlikely(count < retain_target))
2450  		count = 0;
2451  	else
2452  		count -= retain_target;
2453  
2454  	if (unlikely(count < queued_for_cleanup))
2455  		count = 0;
2456  	else
2457  		count -= queued_for_cleanup;
2458  
2459  	return count;
2460  }
2461  
2462  /*
2463   * Create the buffering interface
2464   */
2465  struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned int block_size,
2466  					       unsigned int reserved_buffers, unsigned int aux_size,
2467  					       void (*alloc_callback)(struct dm_buffer *),
2468  					       void (*write_callback)(struct dm_buffer *),
2469  					       unsigned int flags)
2470  {
2471  	int r;
2472  	unsigned int num_locks;
2473  	struct dm_bufio_client *c;
2474  	char slab_name[64];
2475  	static atomic_t seqno = ATOMIC_INIT(0);
2476  
2477  	if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
2478  		DMERR("%s: block size not specified or is not multiple of 512b", __func__);
2479  		r = -EINVAL;
2480  		goto bad_client;
2481  	}
2482  
2483  	num_locks = dm_num_hash_locks();
2484  	c = kzalloc(sizeof(*c) + (num_locks * sizeof(struct buffer_tree)), GFP_KERNEL);
2485  	if (!c) {
2486  		r = -ENOMEM;
2487  		goto bad_client;
2488  	}
2489  	cache_init(&c->cache, num_locks, (flags & DM_BUFIO_CLIENT_NO_SLEEP) != 0);
2490  
2491  	c->bdev = bdev;
2492  	c->block_size = block_size;
2493  	if (is_power_of_2(block_size))
2494  		c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
2495  	else
2496  		c->sectors_per_block_bits = -1;
2497  
2498  	c->alloc_callback = alloc_callback;
2499  	c->write_callback = write_callback;
2500  
2501  	if (flags & DM_BUFIO_CLIENT_NO_SLEEP) {
2502  		c->no_sleep = true;
2503  		static_branch_inc(&no_sleep_enabled);
2504  	}
2505  
2506  	mutex_init(&c->lock);
2507  	spin_lock_init(&c->spinlock);
2508  	INIT_LIST_HEAD(&c->reserved_buffers);
2509  	c->need_reserved_buffers = reserved_buffers;
2510  
2511  	dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
2512  
2513  	init_waitqueue_head(&c->free_buffer_wait);
2514  	c->async_write_error = 0;
2515  
2516  	c->dm_io = dm_io_client_create();
2517  	if (IS_ERR(c->dm_io)) {
2518  		r = PTR_ERR(c->dm_io);
2519  		goto bad_dm_io;
2520  	}
2521  
2522  	if (block_size <= KMALLOC_MAX_SIZE &&
2523  	    (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
2524  		unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE);
2525  
2526  		snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u-%u",
2527  					block_size, atomic_inc_return(&seqno));
2528  		c->slab_cache = kmem_cache_create(slab_name, block_size, align,
2529  						  SLAB_RECLAIM_ACCOUNT, NULL);
2530  		if (!c->slab_cache) {
2531  			r = -ENOMEM;
2532  			goto bad;
2533  		}
2534  	}
2535  	if (aux_size)
2536  		snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u-%u",
2537  					aux_size, atomic_inc_return(&seqno));
2538  	else
2539  		snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u",
2540  					atomic_inc_return(&seqno));
2541  	c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
2542  					   0, SLAB_RECLAIM_ACCOUNT, NULL);
2543  	if (!c->slab_buffer) {
2544  		r = -ENOMEM;
2545  		goto bad;
2546  	}
2547  
2548  	while (c->need_reserved_buffers) {
2549  		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
2550  
2551  		if (!b) {
2552  			r = -ENOMEM;
2553  			goto bad;
2554  		}
2555  		__free_buffer_wake(b);
2556  	}
2557  
2558  	INIT_WORK(&c->shrink_work, shrink_work);
2559  	atomic_long_set(&c->need_shrink, 0);
2560  
2561  	c->shrinker = shrinker_alloc(0, "dm-bufio:(%u:%u)",
2562  				     MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
2563  	if (!c->shrinker) {
2564  		r = -ENOMEM;
2565  		goto bad;
2566  	}
2567  
2568  	c->shrinker->count_objects = dm_bufio_shrink_count;
2569  	c->shrinker->scan_objects = dm_bufio_shrink_scan;
2570  	c->shrinker->seeks = 1;
2571  	c->shrinker->batch = 0;
2572  	c->shrinker->private_data = c;
2573  
2574  	shrinker_register(c->shrinker);
2575  
2576  	mutex_lock(&dm_bufio_clients_lock);
2577  	dm_bufio_client_count++;
2578  	list_add(&c->client_list, &dm_bufio_all_clients);
2579  	__cache_size_refresh();
2580  	mutex_unlock(&dm_bufio_clients_lock);
2581  
2582  	return c;
2583  
2584  bad:
2585  	while (!list_empty(&c->reserved_buffers)) {
2586  		struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);
2587  
2588  		list_del(&b->lru.list);
2589  		free_buffer(b);
2590  	}
2591  	kmem_cache_destroy(c->slab_cache);
2592  	kmem_cache_destroy(c->slab_buffer);
2593  	dm_io_client_destroy(c->dm_io);
2594  bad_dm_io:
2595  	mutex_destroy(&c->lock);
2596  	if (c->no_sleep)
2597  		static_branch_dec(&no_sleep_enabled);
2598  	kfree(c);
2599  bad_client:
2600  	return ERR_PTR(r);
2601  }
2602  EXPORT_SYMBOL_GPL(dm_bufio_client_create);
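
/*
 * Example (illustrative sketch; "dev" is a hypothetical struct dm_dev the
 * target obtained with dm_get_device()): a target usually creates one client
 * per underlying device in its ctr and destroys it in its dtr.  Here
 * 4096-byte blocks, one reserved buffer, no per-buffer aux data, no
 * callbacks and no flags are requested:
 *
 *	struct dm_bufio_client *client;
 *
 *	client = dm_bufio_client_create(dev->bdev, 4096, 1, 0,
 *					NULL, NULL, 0);
 *	if (IS_ERR(client))
 *		return PTR_ERR(client);
 *	...
 *	dm_bufio_client_destroy(client);
 */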
2603  
2604  /*
2605   * Free the buffering interface.
2606   * It is required that there are no references to any buffers.
2607   */
2608  void dm_bufio_client_destroy(struct dm_bufio_client *c)
2609  {
2610  	unsigned int i;
2611  
2612  	drop_buffers(c);
2613  
2614  	shrinker_free(c->shrinker);
2615  	flush_work(&c->shrink_work);
2616  
2617  	mutex_lock(&dm_bufio_clients_lock);
2618  
2619  	list_del(&c->client_list);
2620  	dm_bufio_client_count--;
2621  	__cache_size_refresh();
2622  
2623  	mutex_unlock(&dm_bufio_clients_lock);
2624  
2625  	WARN_ON(c->need_reserved_buffers);
2626  
2627  	while (!list_empty(&c->reserved_buffers)) {
2628  		struct dm_buffer *b = list_to_buffer(c->reserved_buffers.next);
2629  
2630  		list_del(&b->lru.list);
2631  		free_buffer(b);
2632  	}
2633  
2634  	for (i = 0; i < LIST_SIZE; i++)
2635  		if (cache_count(&c->cache, i))
2636  			DMERR("leaked buffer count %d: %lu", i, cache_count(&c->cache, i));
2637  
2638  	for (i = 0; i < LIST_SIZE; i++)
2639  		WARN_ON(cache_count(&c->cache, i));
2640  
2641  	cache_destroy(&c->cache);
2642  	kmem_cache_destroy(c->slab_cache);
2643  	kmem_cache_destroy(c->slab_buffer);
2644  	dm_io_client_destroy(c->dm_io);
2645  	mutex_destroy(&c->lock);
2646  	if (c->no_sleep)
2647  		static_branch_dec(&no_sleep_enabled);
2648  	kfree(c);
2649  }
2650  EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
2651  
2652  void dm_bufio_client_reset(struct dm_bufio_client *c)
2653  {
2654  	drop_buffers(c);
2655  	flush_work(&c->shrink_work);
2656  }
2657  EXPORT_SYMBOL_GPL(dm_bufio_client_reset);
2658  
2659  void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
2660  {
2661  	c->start = start;
2662  }
2663  EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
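
/*
 * Example (illustrative sketch; "superblock_sectors" is hypothetical): with
 * a sector offset set, buffer block 0 starts at that sector of the device
 * (dm_bufio_get_device_size() likewise subtracts the offset), which is
 * convenient when a superblock or other header precedes the bufio-managed
 * area:
 *
 *	dm_bufio_set_sector_offset(client, superblock_sectors);
 */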
2664  
2665  /*--------------------------------------------------------------*/
2666  
2667  static unsigned int get_max_age_hz(void)
2668  {
2669  	unsigned int max_age = READ_ONCE(dm_bufio_max_age);
2670  
2671  	if (max_age > UINT_MAX / HZ)
2672  		max_age = UINT_MAX / HZ;
2673  
2674  	return max_age * HZ;
2675  }
2676  
2677  static bool older_than(struct dm_buffer *b, unsigned long age_hz)
2678  {
2679  	return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz);
2680  }
2681  
2682  struct evict_params {
2683  	gfp_t gfp;
2684  	unsigned long age_hz;
2685  
2686  	/*
2687  	 * This gets updated with the largest last_accessed (i.e. most
2688  	 * recently used) of the evicted buffers.  It will not be reinitialised
2689  	 * by __evict_many(), so you can use it across multiple invocations.
2690  	 */
2691  	unsigned long last_accessed;
2692  };
2693  
2694  /*
2695   * We may not be able to evict this buffer if IO is pending or the client
2696   * is still using it.
2697   *
2698   * And if GFP_NOFS is used, we must not do any I/O because we hold
2699   * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
2700   * rerouted to a different bufio client.
2701   */
2702  static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
2703  {
2704  	struct evict_params *params = context;
2705  
2706  	if (!(params->gfp & __GFP_FS) ||
2707  	    (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) {
2708  		if (test_bit_acquire(B_READING, &b->state) ||
2709  		    test_bit(B_WRITING, &b->state) ||
2710  		    test_bit(B_DIRTY, &b->state))
2711  			return ER_DONT_EVICT;
2712  	}
2713  
2714  	return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP;
2715  }
2716  
2717  static unsigned long __evict_many(struct dm_bufio_client *c,
2718  				  struct evict_params *params,
2719  				  int list_mode, unsigned long max_count)
2720  {
2721  	unsigned long count;
2722  	unsigned long last_accessed;
2723  	struct dm_buffer *b;
2724  
2725  	for (count = 0; count < max_count; count++) {
2726  		b = cache_evict(&c->cache, list_mode, select_for_evict, params);
2727  		if (!b)
2728  			break;
2729  
2730  		last_accessed = READ_ONCE(b->last_accessed);
2731  		if (time_after_eq(params->last_accessed, last_accessed))
2732  			params->last_accessed = last_accessed;
2733  
2734  		__make_buffer_clean(b);
2735  		__free_buffer_wake(b);
2736  
2737  		cond_resched();
2738  	}
2739  
2740  	return count;
2741  }
2742  
2743  static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
2744  {
2745  	struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0};
2746  	unsigned long retain = get_retain_buffers(c);
2747  	unsigned long count;
2748  	LIST_HEAD(write_list);
2749  
2750  	dm_bufio_lock(c);
2751  
2752  	__check_watermark(c, &write_list);
2753  	if (unlikely(!list_empty(&write_list))) {
2754  		dm_bufio_unlock(c);
2755  		__flush_write_list(&write_list);
2756  		dm_bufio_lock(c);
2757  	}
2758  
2759  	count = cache_total(&c->cache);
2760  	if (count > retain)
2761  		__evict_many(c, &params, LIST_CLEAN, count - retain);
2762  
2763  	dm_bufio_unlock(c);
2764  }
2765  
2766  static void cleanup_old_buffers(void)
2767  {
2768  	unsigned long max_age_hz = get_max_age_hz();
2769  	struct dm_bufio_client *c;
2770  
2771  	mutex_lock(&dm_bufio_clients_lock);
2772  
2773  	__cache_size_refresh();
2774  
2775  	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
2776  		evict_old_buffers(c, max_age_hz);
2777  
2778  	mutex_unlock(&dm_bufio_clients_lock);
2779  }
2780  
2781  static void work_fn(struct work_struct *w)
2782  {
2783  	cleanup_old_buffers();
2784  
2785  	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2786  			   DM_BUFIO_WORK_TIMER_SECS * HZ);
2787  }
2788  
2789  /*--------------------------------------------------------------*/
2790  
2791  /*
2792   * Global cleanup tries to evict the oldest buffers from across _all_
2793   * the clients.  It does this by repeatedly evicting a few buffers from
2794   * the client that holds the oldest buffer.  It's approximate, but hopefully
2795   * good enough.
2796   */
2797  static struct dm_bufio_client *__pop_client(void)
2798  {
2799  	struct list_head *h;
2800  
2801  	if (list_empty(&dm_bufio_all_clients))
2802  		return NULL;
2803  
2804  	h = dm_bufio_all_clients.next;
2805  	list_del(h);
2806  	return container_of(h, struct dm_bufio_client, client_list);
2807  }
2808  
2809  /*
2810   * Inserts the client in the global client list based on its
2811   * 'oldest_buffer' field.
2812   */
2813  static void __insert_client(struct dm_bufio_client *new_client)
2814  {
2815  	struct dm_bufio_client *c;
2816  	struct list_head *h = dm_bufio_all_clients.next;
2817  
2818  	while (h != &dm_bufio_all_clients) {
2819  		c = container_of(h, struct dm_bufio_client, client_list);
2820  		if (time_after_eq(c->oldest_buffer, new_client->oldest_buffer))
2821  			break;
2822  		h = h->next;
2823  	}
2824  
2825  	list_add_tail(&new_client->client_list, h);
2826  }
2827  
2828  static unsigned long __evict_a_few(unsigned long nr_buffers)
2829  {
2830  	unsigned long count;
2831  	struct dm_bufio_client *c;
2832  	struct evict_params params = {
2833  		.gfp = GFP_KERNEL,
2834  		.age_hz = 0,
2835  		/* set to jiffies in case there are no buffers in this client */
2836  		.last_accessed = jiffies
2837  	};
2838  
2839  	c = __pop_client();
2840  	if (!c)
2841  		return 0;
2842  
2843  	dm_bufio_lock(c);
2844  	count = __evict_many(c, &params, LIST_CLEAN, nr_buffers);
2845  	dm_bufio_unlock(c);
2846  
2847  	if (count)
2848  		c->oldest_buffer = params.last_accessed;
2849  	__insert_client(c);
2850  
2851  	return count;
2852  }
2853  
2854  static void check_watermarks(void)
2855  {
2856  	LIST_HEAD(write_list);
2857  	struct dm_bufio_client *c;
2858  
2859  	mutex_lock(&dm_bufio_clients_lock);
2860  	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
2861  		dm_bufio_lock(c);
2862  		__check_watermark(c, &write_list);
2863  		dm_bufio_unlock(c);
2864  	}
2865  	mutex_unlock(&dm_bufio_clients_lock);
2866  
2867  	__flush_write_list(&write_list);
2868  }
2869  
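/*
 * Evict clean buffers across all clients until the global allocation drops
 * below the low watermark (15/16 of the configured cache size, given
 * DM_BUFIO_LOW_WATERMARK_RATIO of 16).
 */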
2870  static void evict_old(void)
2871  {
2872  	unsigned long threshold = dm_bufio_cache_size -
2873  		dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
2874  
2875  	mutex_lock(&dm_bufio_clients_lock);
2876  	while (dm_bufio_current_allocated > threshold) {
2877  		if (!__evict_a_few(64))
2878  			break;
2879  		cond_resched();
2880  	}
2881  	mutex_unlock(&dm_bufio_clients_lock);
2882  }
2883  
2884  static void do_global_cleanup(struct work_struct *w)
2885  {
2886  	check_watermarks();
2887  	evict_old();
2888  }
2889  
2890  /*
2891   *--------------------------------------------------------------
2892   * Module setup
2893   *--------------------------------------------------------------
2894   */
2895  
2896  /*
2897   * This is called only once for the whole dm_bufio module.
2898   * It initializes the memory limit.
2899   */
2900  static int __init dm_bufio_init(void)
2901  {
2902  	__u64 mem;
2903  
2904  	dm_bufio_allocated_kmem_cache = 0;
2905  	dm_bufio_allocated_get_free_pages = 0;
2906  	dm_bufio_allocated_vmalloc = 0;
2907  	dm_bufio_current_allocated = 0;
2908  
2909  	mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
2910  			       DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
2911  
2912  	if (mem > ULONG_MAX)
2913  		mem = ULONG_MAX;
2914  
2915  #ifdef CONFIG_MMU
2916  	if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
2917  		mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
2918  #endif
2919  
2920  	dm_bufio_default_cache_size = mem;
2921  
2922  	mutex_lock(&dm_bufio_clients_lock);
2923  	__cache_size_refresh();
2924  	mutex_unlock(&dm_bufio_clients_lock);
2925  
2926  	dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
2927  	if (!dm_bufio_wq)
2928  		return -ENOMEM;
2929  
2930  	INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
2931  	INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
2932  	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2933  			   DM_BUFIO_WORK_TIMER_SECS * HZ);
2934  
2935  	return 0;
2936  }
2937  
2938  /*
2939   * This is called once when unloading the dm_bufio module.
2940   */
2941  static void __exit dm_bufio_exit(void)
2942  {
2943  	int bug = 0;
2944  
2945  	cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
2946  	destroy_workqueue(dm_bufio_wq);
2947  
2948  	if (dm_bufio_client_count) {
2949  		DMCRIT("%s: dm_bufio_client_count leaked: %d",
2950  			__func__, dm_bufio_client_count);
2951  		bug = 1;
2952  	}
2953  
2954  	if (dm_bufio_current_allocated) {
2955  		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2956  			__func__, dm_bufio_current_allocated);
2957  		bug = 1;
2958  	}
2959  
2960  	if (dm_bufio_allocated_get_free_pages) {
2961  		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2962  		       __func__, dm_bufio_allocated_get_free_pages);
2963  		bug = 1;
2964  	}
2965  
2966  	if (dm_bufio_allocated_vmalloc) {
2967  		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
2968  		       __func__, dm_bufio_allocated_vmalloc);
2969  		bug = 1;
2970  	}
2971  
2972  	WARN_ON(bug); /* leaks are not worth crashing the system */
2973  }
2974  
2975  module_init(dm_bufio_init)
2976  module_exit(dm_bufio_exit)
2977  
2978  module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644);
2979  MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
2980  
2981  module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644);
2982  MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
2983  
2984  module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644);
2985  MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2986  
2987  module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, 0644);
2988  MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2989  
2990  module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, 0444);
2991  MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2992  
2993  module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, 0444);
2994  MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2995  
2996  module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, 0444);
2997  MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2998  
2999  module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, 0444);
3000  MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
3001  
3002  MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
3003  MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
3004  MODULE_LICENSE("GPL");
3005