// SPDX-License-Identifier: GPL-2.0

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/sizes.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
#include "fs.h"

/*
 * This contains the logic to handle async discard.
 *
 * Async discard manages trimming of free space outside of transaction commit.
 * Discarding is done by managing the block_groups on an LRU list based on free
 * space recency.  Two passes are used: the first prioritizes discarding
 * extents and the second gives bitmap trimming the best opportunity to
 * coalesce.  The block_groups are maintained on multiple lists to allow for
 * multiple passes with different discard filter requirements.  A delayed work
 * item is used to manage discarding with a timeout determined by the max of
 * the delay incurred by the iops rate limit, the byte rate limit, and the max
 * delay of BTRFS_DISCARD_MAX_DELAY_MSEC.
 *
 * Note, this only keeps track of block_groups that are explicitly for data.
 * Mixed block_groups are not supported.
 *
 * The first list is special to manage discarding of fully free block groups.
 * This is necessary because we issue a final trim for a fully free block group
 * after forgetting it.  When a block group becomes unused, instead of directly
 * being added to the unused_bgs list, we add it to this first list.  Then
 * from there, if it becomes fully discarded, we place it onto the unused_bgs
 * list.
 *
 * The in-memory free space cache serves as the backing state for discard.
 * Consequently, there is no persistence.  We opt to load all the block groups
 * in as not discarded, so the mount case degenerates to the crashing case.
 *
 * As the free space cache uses bitmaps, there exists a tradeoff between
 * ease/efficiency for find_free_extent() and the accuracy of discard state.
 * Here we opt to let untrimmed regions merge with everything while only letting
 * trimmed regions merge with other trimmed regions.  This can cause
 * overtrimming, but the coalescing benefit seems to be worth it.  Additionally,
 * bitmap state is tracked as a whole.  If we're able to fully trim a bitmap,
 * the trimmed flag is set on the bitmap.  Otherwise, if an allocation comes in,
 * this resets the state and we will retry trimming the whole bitmap.  This is a
 * tradeoff between discard state accuracy and the cost of accounting.
 */

/* This is an initial delay to give some chance for block reuse */
#define BTRFS_DISCARD_DELAY		(120ULL * NSEC_PER_SEC)
#define BTRFS_DISCARD_UNUSED_DELAY	(10ULL * NSEC_PER_SEC)

#define BTRFS_DISCARD_MIN_DELAY_MSEC	(1UL)
#define BTRFS_DISCARD_MAX_DELAY_MSEC	(1000UL)
#define BTRFS_DISCARD_MAX_IOPS		(1000U)

/* Monotonically decreasing minimum length filters after index 0 */
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
	0,
	BTRFS_ASYNC_DISCARD_MAX_FILTER,
	BTRFS_ASYNC_DISCARD_MIN_FILTER
};

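/*
 * Return the discard list the block group belongs to based on its
 * discard_index.
 */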
static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
					  const struct btrfs_block_group *block_group)
{
	return &discard_ctl->discard_list[block_group->discard_index];
}

/*
 * Determine if async discard should be running.
 *
 * @discard_ctl: discard control
 *
 * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
 */
static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_fs_info *fs_info = container_of(discard_ctl,
						     struct btrfs_fs_info,
						     discard_ctl);

	return (!(fs_info->sb->s_flags & SB_RDONLY) &&
		test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
}

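/*
 * Add a block group to its filter list.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Caller must hold discard_ctl->lock.  If the block group is newly queued or
 * is coming from the unused list, reset its index, eligible time and cursor
 * state.  A reference is taken when it is first queued, and the block group
 * is moved to the tail of its list to maintain LRU order.
 */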
static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				  struct btrfs_block_group *block_group)
{
	lockdep_assert_held(&discard_ctl->lock);
	if (!btrfs_run_discard_work(discard_ctl))
		return;

	if (list_empty(&block_group->discard_list) ||
	    block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
			block_group->discard_index = BTRFS_DISCARD_INDEX_START;
		block_group->discard_eligible_time = (ktime_get_ns() +
						      BTRFS_DISCARD_DELAY);
		block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	}
	if (list_empty(&block_group->discard_list))
		btrfs_get_block_group(block_group);

	list_move_tail(&block_group->discard_list,
		       get_discard_list(discard_ctl, block_group));
}

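/*
 * Add a data block group to its discard filter list, taking the discard lock.
 * Block groups that are not data only are ignored.
 */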
static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				struct btrfs_block_group *block_group)
{
	if (!btrfs_is_block_group_data_only(block_group))
		return;

	spin_lock(&discard_ctl->lock);
	__add_to_discard_list(discard_ctl, block_group);
	spin_unlock(&discard_ctl->lock);
}

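/*
 * Queue a now empty block group on the special unused discard list.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * The block group gets the shorter BTRFS_DISCARD_UNUSED_DELAY and its cursor
 * is reset so the whole block group is trimmed before it is handed over to
 * the unused_bgs path.
 */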
static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	bool queued;

	spin_lock(&discard_ctl->lock);

	queued = !list_empty(&block_group->discard_list);

	if (!btrfs_run_discard_work(discard_ctl)) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	list_del_init(&block_group->discard_list);

	block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
	block_group->discard_eligible_time = (ktime_get_ns() +
					      BTRFS_DISCARD_UNUSED_DELAY);
	block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	if (!queued)
		btrfs_get_block_group(block_group);
	list_add_tail(&block_group->discard_list,
		      &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);

	spin_unlock(&discard_ctl->lock);
}

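/*
 * Remove a block group from the discard lists and drop the list's reference.
 * Returns true if the block group is currently being processed by the discard
 * workfn, in which case the workfn drops the reference itself.
 */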
static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
				     struct btrfs_block_group *block_group)
{
	bool running = false;
	bool queued = false;

	spin_lock(&discard_ctl->lock);

	if (block_group == discard_ctl->block_group) {
		running = true;
		discard_ctl->block_group = NULL;
	}

	block_group->discard_eligible_time = 0;
	queued = !list_empty(&block_group->discard_list);
	list_del_init(&block_group->discard_list);
	/*
	 * If the block group is currently running in the discard workfn, we
	 * don't want to deref it, since it's still being used by the workfn.
	 * The workfn will notice this case and deref the block group when it is
	 * finished.
	 */
	if (queued && !running)
		btrfs_put_block_group(block_group);

	spin_unlock(&discard_ctl->lock);

	return running;
}

/*
 * Find block_group that's up next for discarding.
 *
 * @discard_ctl:  discard control
 * @now:          current time
 *
 * Iterate over the discard lists to find the next block_group up for
 * discarding, checking the discard_eligible_time of each block_group.
 */
static struct btrfs_block_group *find_next_block_group(
					struct btrfs_discard_ctl *discard_ctl,
					u64 now)
{
	struct btrfs_block_group *ret_block_group = NULL, *block_group;
	int i;

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		struct list_head *discard_list = &discard_ctl->discard_list[i];

		if (!list_empty(discard_list)) {
			block_group = list_first_entry(discard_list,
						       struct btrfs_block_group,
						       discard_list);

			if (!ret_block_group)
				ret_block_group = block_group;

			if (ret_block_group->discard_eligible_time < now)
				break;

			if (ret_block_group->discard_eligible_time >
			    block_group->discard_eligible_time)
				ret_block_group = block_group;
		}
	}

	return ret_block_group;
}

/*
 * Look up next block group and set it for use.
 *
 * @discard_ctl:   discard control
 * @discard_state: the discard_state of the block_group after state management
 * @discard_index: the discard_index of the block_group after state management
 * @now:           time when discard was invoked, in ns
 *
 * Wrap find_next_block_group() and set the block_group to be in use.
 * @discard_state's control flow is managed here.  Variables related to
 * @discard_state are reset here as needed (e.g. @discard_cursor).
 * @discard_state and @discard_index are remembered as they may change while
 * we're discarding, but we want the discard to execute in the context
 * determined here.
 */
static struct btrfs_block_group *peek_discard_list(
					struct btrfs_discard_ctl *discard_ctl,
					enum btrfs_discard_state *discard_state,
					int *discard_index, u64 now)
{
	struct btrfs_block_group *block_group;

	spin_lock(&discard_ctl->lock);
again:
	block_group = find_next_block_group(discard_ctl, now);

	if (block_group && now >= block_group->discard_eligible_time) {
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
		    block_group->used != 0) {
			if (btrfs_is_block_group_data_only(block_group)) {
				__add_to_discard_list(discard_ctl, block_group);
			} else {
				list_del_init(&block_group->discard_list);
				btrfs_put_block_group(block_group);
			}
			goto again;
		}
		if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
			block_group->discard_cursor = block_group->start;
			block_group->discard_state = BTRFS_DISCARD_EXTENTS;
		}
		discard_ctl->block_group = block_group;
	}
	if (block_group) {
		*discard_state = block_group->discard_state;
		*discard_index = block_group->discard_index;
	}
	spin_unlock(&discard_ctl->lock);

	return block_group;
}

/*
 * Update a block group's filters.
 *
 * @block_group:  block group of interest
 * @bytes:        recently freed region size after coalescing
 *
 * Async discard maintains multiple lists with progressively smaller filters
 * to prioritize discarding based on size.  Should a free space that matches
 * a larger filter be returned to the free_space_cache, prioritize that discard
 * by moving @block_group to the proper filter.
 */
void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
				u64 bytes)
{
	struct btrfs_discard_ctl *discard_ctl;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	discard_ctl = &block_group->fs_info->discard_ctl;

	if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
	    bytes >= discard_minlen[block_group->discard_index - 1]) {
		int i;

		remove_from_discard_list(discard_ctl, block_group);

		for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
		     i++) {
			if (bytes >= discard_minlen[i]) {
				block_group->discard_index = i;
				add_to_discard_list(discard_ctl, block_group);
				break;
			}
		}
	}
}

/*
 * Move a block group along the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Increment @block_group's discard_index.  If it falls off the list, let it be.
 * Otherwise add it back to the appropriate list.
 */
static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	block_group->discard_index++;
	if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
		block_group->discard_index = 1;
		return;
	}

	add_to_discard_list(discard_ctl, block_group);
}

/*
 * Remove a block_group from the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Remove @block_group from the discard lists.  If necessary, wait on the
 * current work and then reschedule the delayed work.
 */
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
			       struct btrfs_block_group *block_group)
{
	if (remove_from_discard_list(discard_ctl, block_group)) {
		cancel_delayed_work_sync(&discard_ctl->work);
		btrfs_discard_schedule_work(discard_ctl, true);
	}
}

/*
 * Handle queuing of block_groups.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Maintain the LRU order of the discard lists.
 */
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
			      struct btrfs_block_group *block_group)
{
	if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	if (block_group->used == 0)
		add_to_discard_unused_list(discard_ctl, block_group);
	else
		add_to_discard_list(discard_ctl, block_group);

	if (!delayed_work_pending(&discard_ctl->work))
		btrfs_discard_schedule_work(discard_ctl, false);
}

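/*
 * Schedule the delayed discard work.
 *
 * @discard_ctl: discard control
 * @now:         current time, in ns
 * @override:    override the current timer
 *
 * Caller must hold discard_ctl->lock.  The delay is the maximum of the base
 * delay, the bytes rate limit delay and the remaining eligibility timeout of
 * the next block group.  On override, the time already spent since the
 * previous discard is credited against the delay.
 */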
static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
					  u64 now, bool override)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_run_discard_work(discard_ctl))
		return;
	if (!override && delayed_work_pending(&discard_ctl->work))
		return;

	block_group = find_next_block_group(discard_ctl, now);
	if (block_group) {
		u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
		u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);

		/*
		 * A single delayed workqueue item is responsible for
		 * discarding, so we can manage the bytes rate limit by keeping
		 * track of the previous discard.
		 */
		if (kbps_limit && discard_ctl->prev_discard) {
			u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
			u64 bps_delay = div64_u64(discard_ctl->prev_discard *
						  NSEC_PER_SEC, bps_limit);

			delay = max(delay, bps_delay);
		}

		/*
		 * This timeout is to hopefully prevent immediate discarding
		 * in a recently allocated block group.
		 */
		if (now < block_group->discard_eligible_time) {
			u64 bg_timeout = block_group->discard_eligible_time - now;

			delay = max(delay, bg_timeout);
		}

		if (override && discard_ctl->prev_discard) {
			u64 elapsed = now - discard_ctl->prev_discard_time;

			if (delay > elapsed)
				delay -= elapsed;
			else
				delay = 0;
		}

		mod_delayed_work(discard_ctl->discard_workers,
				 &discard_ctl->work, nsecs_to_jiffies(delay));
	}
}

/*
 * Responsible for scheduling the discard work.
 *
 * @discard_ctl:  discard control
 * @override:     override the current timer
 *
 * Discards are issued by a delayed workqueue item.  @override is used to
 * update the current delay as the baseline delay interval is reevaluated on
 * transaction commit.  This is also maxed with any other rate limit.
 */
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
				 bool override)
{
	const u64 now = ktime_get_ns();

	spin_lock(&discard_ctl->lock);
	__btrfs_discard_schedule_work(discard_ctl, now, override);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Determine next step of a block_group.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Determine the next step for a block group after it's finished going through
 * a pass on a discard list.  If it is unused and fully trimmed, we can mark it
 * unused and send it to the unused_bgs path.  Otherwise, pass it onto the
 * appropriate filter list or let it fall off.
 */
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
				      struct btrfs_block_group *block_group)
{
	remove_from_discard_list(discard_ctl, block_group);

	if (block_group->used == 0) {
		if (btrfs_is_free_space_trimmed(block_group))
			btrfs_mark_bg_unused(block_group);
		else
			add_to_discard_unused_list(discard_ctl, block_group);
	} else {
		btrfs_update_discard_index(discard_ctl, block_group);
	}
}

/*
 * Discard work queue callback
 *
 * @work: work
 *
 * Find the next block_group to start discarding and then discard a single
 * region.  It does this in a two-pass fashion: first extents and second
 * bitmaps.  Completely discarded block groups are sent to the unused_bgs path.
 */
static void btrfs_discard_workfn(struct work_struct *work)
{
	struct btrfs_discard_ctl *discard_ctl;
	struct btrfs_block_group *block_group;
	enum btrfs_discard_state discard_state;
	int discard_index = 0;
	u64 trimmed = 0;
	u64 minlen = 0;
	u64 now = ktime_get_ns();

	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);

	block_group = peek_discard_list(discard_ctl, &discard_state,
					&discard_index, now);
	if (!block_group || !btrfs_run_discard_work(discard_ctl))
		return;
	if (now < block_group->discard_eligible_time) {
		btrfs_discard_schedule_work(discard_ctl, false);
		return;
	}

	/* Perform discarding */
	minlen = discard_minlen[discard_index];

	if (discard_state == BTRFS_DISCARD_BITMAPS) {
		u64 maxlen = 0;

		/*
		 * Use the previous level's minimum discard length as the max
		 * length filter.  In the case something is added to make a
		 * region go beyond the max filter, the entire bitmap is set
		 * back to BTRFS_TRIM_STATE_UNTRIMMED.
		 */
		if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
			maxlen = discard_minlen[discard_index - 1];

		btrfs_trim_block_group_bitmaps(block_group, &trimmed,
				       block_group->discard_cursor,
				       btrfs_block_group_end(block_group),
				       minlen, maxlen, true);
		discard_ctl->discard_bitmap_bytes += trimmed;
	} else {
		btrfs_trim_block_group_extents(block_group, &trimmed,
				       block_group->discard_cursor,
				       btrfs_block_group_end(block_group),
				       minlen, true);
		discard_ctl->discard_extent_bytes += trimmed;
	}

	/* Determine next steps for a block_group */
	if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
		if (discard_state == BTRFS_DISCARD_BITMAPS) {
			btrfs_finish_discard_pass(discard_ctl, block_group);
		} else {
			block_group->discard_cursor = block_group->start;
			spin_lock(&discard_ctl->lock);
			if (block_group->discard_state !=
			    BTRFS_DISCARD_RESET_CURSOR)
				block_group->discard_state =
							BTRFS_DISCARD_BITMAPS;
			spin_unlock(&discard_ctl->lock);
		}
	}

	now = ktime_get_ns();
	spin_lock(&discard_ctl->lock);
	discard_ctl->prev_discard = trimmed;
	discard_ctl->prev_discard_time = now;
	/*
	 * If the block group was removed from the discard list while it was
	 * running in this workfn, then we didn't deref it, since this function
	 * still owned that reference. But we set the discard_ctl->block_group
	 * back to NULL, so we can use that condition to know that now we need
	 * to deref the block_group.
	 */
	if (discard_ctl->block_group == NULL)
		btrfs_put_block_group(block_group);
	discard_ctl->block_group = NULL;
	__btrfs_discard_schedule_work(discard_ctl, now, false);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Recalculate the base delay.
 *
 * @discard_ctl: discard control
 *
 * Recalculate the base delay which is based on the total number of
 * discardable_extents.  Clamp this between the lower_limit (iops_limit or 1ms)
 * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
 */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
{
	s32 discardable_extents;
	s64 discardable_bytes;
	u32 iops_limit;
	unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
	unsigned long delay;

	discardable_extents = atomic_read(&discard_ctl->discardable_extents);
	if (!discardable_extents)
		return;

	spin_lock(&discard_ctl->lock);

	/*
	 * The following is to fix a potential -1 discrepancy that we're not
	 * sure how to reproduce. But given that this is the only place that
	 * utilizes these numbers and this is only called from
	 * btrfs_finish_extent_commit() which is synchronized, we can correct
	 * here.
	 */
	if (discardable_extents < 0)
		atomic_add(-discardable_extents,
			   &discard_ctl->discardable_extents);

	discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
	if (discardable_bytes < 0)
		atomic64_add(-discardable_bytes,
			     &discard_ctl->discardable_bytes);

	if (discardable_extents <= 0) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	iops_limit = READ_ONCE(discard_ctl->iops_limit);

	if (iops_limit) {
		delay = MSEC_PER_SEC / iops_limit;
	} else {
		/*
		 * Unset iops_limit means go as fast as possible, so allow a
		 * delay of 0.
		 */
		delay = 0;
		min_delay = 0;
	}

	delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
	discard_ctl->delay_ms = delay;

	spin_unlock(&discard_ctl->lock);
}

/*
 * Propagate discard counters.
 *
 * @block_group: block_group of interest
 *
 * Propagate deltas of counters up to the discard_ctl.  It maintains a current
 * counter and a previous counter, passing the delta up to the global stat.
 * Then the current counter value becomes the previous counter value.
 */
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl;
	struct btrfs_discard_ctl *discard_ctl;
	s32 extents_delta;
	s64 bytes_delta;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
	    !btrfs_is_block_group_data_only(block_group))
		return;

	ctl = block_group->free_space_ctl;
	discard_ctl = &block_group->fs_info->discard_ctl;

	lockdep_assert_held(&ctl->tree_lock);
	extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
			ctl->discardable_extents[BTRFS_STAT_PREV];
	if (extents_delta) {
		atomic_add(extents_delta, &discard_ctl->discardable_extents);
		ctl->discardable_extents[BTRFS_STAT_PREV] =
			ctl->discardable_extents[BTRFS_STAT_CURR];
	}

	bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
		      ctl->discardable_bytes[BTRFS_STAT_PREV];
	if (bytes_delta) {
		atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
		ctl->discardable_bytes[BTRFS_STAT_PREV] =
			ctl->discardable_bytes[BTRFS_STAT_CURR];
	}
}

/*
 * Punt unused_bgs list to discard lists.
 *
 * @fs_info: fs_info of interest
 *
 * The unused_bgs list needs to be punted to the discard lists because the
 * order of operations is changed.  In the normal synchronous discard path, the
 * block groups are trimmed via a single large trim in transaction commit.  This
 * is ultimately what we are trying to avoid with asynchronous discard.  Thus,
 * it must be done before going down the unused_bgs path.
 */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group, *next;

	spin_lock(&fs_info->unused_bgs_lock);
	/* We enabled async discard, so punt all to the queue */
	list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
				 bg_list) {
		list_del_init(&block_group->bg_list);
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
		/*
		 * This put is for the get done by btrfs_mark_bg_unused.
		 * Queueing discard incremented it for discard's reference.
		 */
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * Purge discard lists.
 *
 * @discard_ctl: discard control
 *
 * If we are disabling async discard, we may have intercepted block groups that
 * are completely free and ready for the unused_bgs path.  As discarding will
 * now happen in transaction commit or not at all, we can safely mark the
 * corresponding block groups as unused and they will be sent on their merry
 * way to the unused_bgs list.
 */
static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_block_group *block_group, *next;
	int i;

	spin_lock(&discard_ctl->lock);
	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		list_for_each_entry_safe(block_group, next,
					 &discard_ctl->discard_list[i],
					 discard_list) {
			list_del_init(&block_group->discard_list);
			spin_unlock(&discard_ctl->lock);
			if (block_group->used == 0)
				btrfs_mark_bg_unused(block_group);
			spin_lock(&discard_ctl->lock);
			btrfs_put_block_group(block_group);
		}
	}
	spin_unlock(&discard_ctl->lock);
}

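/*
 * Start async discard if the mount option is enabled, otherwise fall back to
 * the cleanup path.  The unused_bgs list is punted to the discard lists first
 * so fully free block groups are trimmed before they are reclaimed.
 */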
void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
		btrfs_discard_cleanup(fs_info);
		return;
	}

	btrfs_discard_punt_unused_bgs_list(fs_info);

	set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

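/* Stop async discard by clearing BTRFS_FS_DISCARD_RUNNING. */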
void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
{
	clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

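/* Initialize the discard control, its lists and the default limits. */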
void btrfs_discard_init(struct btrfs_fs_info *fs_info)
{
	struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
	int i;

	spin_lock_init(&discard_ctl->lock);
	INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
		INIT_LIST_HEAD(&discard_ctl->discard_list[i]);

	discard_ctl->prev_discard = 0;
	discard_ctl->prev_discard_time = 0;
	atomic_set(&discard_ctl->discardable_extents, 0);
	atomic64_set(&discard_ctl->discardable_bytes, 0);
	discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
	discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
	discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
	discard_ctl->kbps_limit = 0;
	discard_ctl->discard_extent_bytes = 0;
	discard_ctl->discard_bitmap_bytes = 0;
	atomic64_set(&discard_ctl->discard_bytes_saved, 0);
}

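/*
 * Stop async discard, cancel any pending work and purge the discard lists,
 * sending fully free block groups down the unused_bgs path.
 */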
void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
{
	btrfs_discard_stop(fs_info);
	cancel_delayed_work_sync(&fs_info->discard_ctl.work);
	btrfs_discard_purge_list(&fs_info->discard_ctl);
}