1  // SPDX-License-Identifier: GPL-2.0
2  
3  #include "bcachefs.h"
4  #include "bkey_methods.h"
5  #include "bkey_sort.h"
6  #include "btree_cache.h"
7  #include "btree_io.h"
8  #include "btree_iter.h"
9  #include "btree_locking.h"
10  #include "btree_update.h"
11  #include "btree_update_interior.h"
12  #include "buckets.h"
13  #include "checksum.h"
14  #include "debug.h"
15  #include "error.h"
16  #include "extents.h"
17  #include "io_write.h"
18  #include "journal_reclaim.h"
19  #include "journal_seq_blacklist.h"
20  #include "recovery.h"
21  #include "super-io.h"
22  #include "trace.h"
23  
24  #include <linux/sched/mm.h>
25  
static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
27  {
	prt_printf(out, "btree=%s l=%u seq %llx\n",
29  		   bch2_btree_id_str(BTREE_NODE_ID(bn)),
30  		   (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq);
31  	prt_str(out, "min: ");
32  	bch2_bpos_to_text(out, bn->min_key);
33  	prt_newline(out);
34  	prt_str(out, "max: ");
35  	bch2_bpos_to_text(out, bn->max_key);
36  }
37  
void bch2_btree_node_io_unlock(struct btree *b)
39  {
40  	EBUG_ON(!btree_node_write_in_flight(b));
41  
42  	clear_btree_node_write_in_flight_inner(b);
43  	clear_btree_node_write_in_flight(b);
44  	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
45  }
46  
void bch2_btree_node_io_lock(struct btree *b)
48  {
49  	wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
50  			    TASK_UNINTERRUPTIBLE);
51  }
52  
void __bch2_btree_node_wait_on_read(struct btree *b)
54  {
55  	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
56  		       TASK_UNINTERRUPTIBLE);
57  }
58  
void __bch2_btree_node_wait_on_write(struct btree *b)
60  {
61  	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
62  		       TASK_UNINTERRUPTIBLE);
63  }
64  
void bch2_btree_node_wait_on_read(struct btree *b)
66  {
67  	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
68  		       TASK_UNINTERRUPTIBLE);
69  }
70  
void bch2_btree_node_wait_on_write(struct btree *b)
72  {
73  	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
74  		       TASK_UNINTERRUPTIBLE);
75  }
76  
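/*
 * Debug mode only: check that the keys in [start, end) are in strictly
 * ascending order, with no overlap between adjacent keys:
 */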
static void verify_no_dups(struct btree *b,
78  			   struct bkey_packed *start,
79  			   struct bkey_packed *end)
80  {
81  #ifdef CONFIG_BCACHEFS_DEBUG
82  	struct bkey_packed *k, *p;
83  
84  	if (start == end)
85  		return;
86  
87  	for (p = start, k = bkey_p_next(start);
88  	     k != end;
89  	     p = k, k = bkey_p_next(k)) {
90  		struct bkey l = bkey_unpack_key(b, p);
91  		struct bkey r = bkey_unpack_key(b, k);
92  
93  		BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
94  	}
95  #endif
96  }
97  
static void set_needs_whiteout(struct bset *i, int v)
99  {
100  	struct bkey_packed *k;
101  
102  	for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
103  		k->needs_whiteout = v;
104  }
105  
static void btree_bounce_free(struct bch_fs *c, size_t size,
107  			      bool used_mempool, void *p)
108  {
109  	if (used_mempool)
110  		mempool_free(p, &c->btree_bounce_pool);
111  	else
112  		kvfree(p);
113  }
114  
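/*
 * Allocate a bounce buffer for sorting/compacting: try a nonblocking
 * kvmalloc() first, falling back to the preallocated mempool;
 * memalloc_nofs_save() prevents recursion into filesystem reclaim:
 */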
static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
116  				bool *used_mempool)
117  {
118  	unsigned flags = memalloc_nofs_save();
119  	void *p;
120  
121  	BUG_ON(size > c->opts.btree_node_size);
122  
123  	*used_mempool = false;
124  	p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
125  	if (!p) {
126  		*used_mempool = true;
127  		p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
128  	}
129  	memalloc_nofs_restore(flags);
130  	return p;
131  }
132  
static void sort_bkey_ptrs(const struct btree *bt,
134  			   struct bkey_packed **ptrs, unsigned nr)
135  {
136  	unsigned n = nr, a = nr / 2, b, c, d;
137  
138  	if (!a)
139  		return;
140  
141  	/* Heap sort: see lib/sort.c: */
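	/*
	 * The first phase (while a > 0) heapifies the array; once a == 0, each
	 * iteration swaps the root to the end of the unsorted region and sifts
	 * the new root back down (bottom-up heapsort, as in lib/sort.c):
	 */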
142  	while (1) {
143  		if (a)
144  			a--;
145  		else if (--n)
146  			swap(ptrs[0], ptrs[n]);
147  		else
148  			break;
149  
150  		for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
151  			b = bch2_bkey_cmp_packed(bt,
152  					    ptrs[c],
153  					    ptrs[d]) >= 0 ? c : d;
154  		if (d == n)
155  			b = c;
156  
157  		while (b != a &&
158  		       bch2_bkey_cmp_packed(bt,
159  				       ptrs[a],
160  				       ptrs[b]) >= 0)
161  			b = (b - 1) / 2;
162  		c = b;
163  		while (b != a) {
164  			b = (b - 1) / 2;
165  			swap(ptrs[b], ptrs[c]);
166  		}
167  	}
168  }
169  
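/*
 * Sort this node's unwritten whiteouts: build an array of pointers to them at
 * the end of a bounce buffer, heap sort the pointers, then copy the whiteouts
 * back in sorted order:
 */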
static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
171  {
172  	struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
173  	bool used_mempool = false;
174  	size_t bytes = b->whiteout_u64s * sizeof(u64);
175  
176  	if (!b->whiteout_u64s)
177  		return;
178  
179  	new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
180  
181  	ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
182  
183  	for (k = unwritten_whiteouts_start(b);
184  	     k != unwritten_whiteouts_end(b);
185  	     k = bkey_p_next(k))
186  		*--ptrs = k;
187  
188  	sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);
189  
190  	k = new_whiteouts;
191  
192  	while (ptrs != ptrs_end) {
193  		bkey_p_copy(k, *ptrs);
194  		k = bkey_p_next(k);
195  		ptrs++;
196  	}
197  
198  	verify_no_dups(b, new_whiteouts,
199  		       (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
200  
201  	memcpy_u64s(unwritten_whiteouts_start(b),
202  		    new_whiteouts, b->whiteout_u64s);
203  
204  	btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
205  }
206  
static bool should_compact_bset(struct btree *b, struct bset_tree *t,
208  				bool compacting, enum compact_mode mode)
209  {
210  	if (!bset_dead_u64s(b, t))
211  		return false;
212  
213  	switch (mode) {
214  	case COMPACT_LAZY:
215  		return should_compact_bset_lazy(b, t) ||
216  			(compacting && !bset_written(b, bset(b, t)));
217  	case COMPACT_ALL:
218  		return true;
219  	default:
220  		BUG();
221  	}
222  }
223  
static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
225  {
226  	bool ret = false;
227  
228  	for_each_bset(b, t) {
229  		struct bset *i = bset(b, t);
230  		struct bkey_packed *k, *n, *out, *start, *end;
231  		struct btree_node_entry *src = NULL, *dst = NULL;
232  
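		/*
		 * Unwritten bsets can be moved, to just past the written
		 * portion of the node (or the end of the previous bset,
		 * whichever is greater):
		 */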
233  		if (t != b->set && !bset_written(b, i)) {
234  			src = container_of(i, struct btree_node_entry, keys);
235  			dst = max(write_block(b),
236  				  (void *) btree_bkey_last(b, t - 1));
237  		}
238  
239  		if (src != dst)
240  			ret = true;
241  
242  		if (!should_compact_bset(b, t, ret, mode)) {
243  			if (src != dst) {
244  				memmove(dst, src, sizeof(*src) +
245  					le16_to_cpu(src->keys.u64s) *
246  					sizeof(u64));
247  				i = &dst->keys;
248  				set_btree_bset(b, t, i);
249  			}
250  			continue;
251  		}
252  
253  		start	= btree_bkey_first(b, t);
254  		end	= btree_bkey_last(b, t);
255  
256  		if (src != dst) {
257  			memmove(dst, src, sizeof(*src));
258  			i = &dst->keys;
259  			set_btree_bset(b, t, i);
260  		}
261  
262  		out = i->start;
263  
264  		for (k = start; k != end; k = n) {
265  			n = bkey_p_next(k);
266  
267  			if (!bkey_deleted(k)) {
268  				bkey_p_copy(out, k);
269  				out = bkey_p_next(out);
270  			} else {
271  				BUG_ON(k->needs_whiteout);
272  			}
273  		}
274  
275  		i->u64s = cpu_to_le16((u64 *) out - i->_data);
276  		set_btree_bset_end(b, t);
277  		bch2_bset_set_no_aux_tree(b, t);
278  		ret = true;
279  	}
280  
281  	bch2_verify_btree_nr_keys(b);
282  
283  	bch2_btree_build_aux_trees(b);
284  
285  	return ret;
286  }
287  
bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
289  			    enum compact_mode mode)
290  {
291  	return bch2_drop_whiteouts(b, mode);
292  }
293  
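/*
 * Merge sort bsets [start_idx, end_idx) into a bounce buffer, then copy the
 * result back into the bset at start_idx - or, if we sorted the entire node,
 * just swap the bounce buffer with the node's buffer:
 */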
static void btree_node_sort(struct bch_fs *c, struct btree *b,
295  			    unsigned start_idx,
296  			    unsigned end_idx)
297  {
298  	struct btree_node *out;
299  	struct sort_iter_stack sort_iter;
300  	struct bset_tree *t;
301  	struct bset *start_bset = bset(b, &b->set[start_idx]);
302  	bool used_mempool = false;
303  	u64 start_time, seq = 0;
304  	unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
305  	bool sorting_entire_node = start_idx == 0 &&
306  		end_idx == b->nsets;
307  
308  	sort_iter_stack_init(&sort_iter, b);
309  
310  	for (t = b->set + start_idx;
311  	     t < b->set + end_idx;
312  	     t++) {
313  		u64s += le16_to_cpu(bset(b, t)->u64s);
314  		sort_iter_add(&sort_iter.iter,
315  			      btree_bkey_first(b, t),
316  			      btree_bkey_last(b, t));
317  	}
318  
319  	bytes = sorting_entire_node
320  		? btree_buf_bytes(b)
321  		: __vstruct_bytes(struct btree_node, u64s);
322  
323  	out = btree_bounce_alloc(c, bytes, &used_mempool);
324  
325  	start_time = local_clock();
326  
327  	u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter);
328  
329  	out->keys.u64s = cpu_to_le16(u64s);
330  
331  	BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
332  
333  	if (sorting_entire_node)
334  		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
335  				       start_time);
336  
337  	/* Make sure we preserve bset journal_seq: */
338  	for (t = b->set + start_idx; t < b->set + end_idx; t++)
339  		seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
340  	start_bset->journal_seq = cpu_to_le64(seq);
341  
342  	if (sorting_entire_node) {
343  		u64s = le16_to_cpu(out->keys.u64s);
344  
345  		BUG_ON(bytes != btree_buf_bytes(b));
346  
347  		/*
348  		 * Our temporary buffer is the same size as the btree node's
349  		 * buffer, we can just swap buffers instead of doing a big
350  		 * memcpy()
351  		 */
352  		*out = *b->data;
353  		out->keys.u64s = cpu_to_le16(u64s);
354  		swap(out, b->data);
355  		set_btree_bset(b, b->set, &b->data->keys);
356  	} else {
357  		start_bset->u64s = out->keys.u64s;
358  		memcpy_u64s(start_bset->start,
359  			    out->keys.start,
360  			    le16_to_cpu(out->keys.u64s));
361  	}
362  
363  	for (i = start_idx + 1; i < end_idx; i++)
364  		b->nr.bset_u64s[start_idx] +=
365  			b->nr.bset_u64s[i];
366  
367  	b->nsets -= shift;
368  
369  	for (i = start_idx + 1; i < b->nsets; i++) {
370  		b->nr.bset_u64s[i]	= b->nr.bset_u64s[i + shift];
371  		b->set[i]		= b->set[i + shift];
372  	}
373  
374  	for (i = b->nsets; i < MAX_BSETS; i++)
375  		b->nr.bset_u64s[i] = 0;
376  
377  	set_btree_bset_end(b, &b->set[start_idx]);
378  	bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
379  
380  	btree_bounce_free(c, bytes, used_mempool, out);
381  
382  	bch2_verify_btree_nr_keys(b);
383  }
384  
void bch2_btree_sort_into(struct bch_fs *c,
386  			 struct btree *dst,
387  			 struct btree *src)
388  {
389  	struct btree_nr_keys nr;
390  	struct btree_node_iter src_iter;
391  	u64 start_time = local_clock();
392  
393  	BUG_ON(dst->nsets != 1);
394  
395  	bch2_bset_set_no_aux_tree(dst, dst->set);
396  
397  	bch2_btree_node_iter_init_from_start(&src_iter, src);
398  
399  	nr = bch2_sort_repack(btree_bset_first(dst),
400  			src, &src_iter,
401  			&dst->format,
402  			true);
403  
404  	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
405  			       start_time);
406  
407  	set_btree_bset_end(dst, dst->set);
408  
409  	dst->nr.live_u64s	+= nr.live_u64s;
410  	dst->nr.bset_u64s[0]	+= nr.bset_u64s[0];
411  	dst->nr.packed_keys	+= nr.packed_keys;
412  	dst->nr.unpacked_keys	+= nr.unpacked_keys;
413  
414  	bch2_verify_btree_nr_keys(dst);
415  }
416  
417  /*
 * We're about to add another bset to the btree node, so if there are currently
 * too many bsets, sort some of them together:
420   */
static bool btree_node_compact(struct bch_fs *c, struct btree *b)
422  {
423  	unsigned unwritten_idx;
424  	bool ret = false;
425  
426  	for (unwritten_idx = 0;
427  	     unwritten_idx < b->nsets;
428  	     unwritten_idx++)
429  		if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
430  			break;
431  
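	/*
	 * Sort all the unwritten bsets together, and all the written bsets
	 * together - written and unwritten bsets are kept separate:
	 */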
432  	if (b->nsets - unwritten_idx > 1) {
433  		btree_node_sort(c, b, unwritten_idx, b->nsets);
434  		ret = true;
435  	}
436  
437  	if (unwritten_idx > 1) {
438  		btree_node_sort(c, b, 0, unwritten_idx);
439  		ret = true;
440  	}
441  
442  	return ret;
443  }
444  
void bch2_btree_build_aux_trees(struct btree *b)
446  {
447  	for_each_bset(b, t)
448  		bch2_bset_build_aux_tree(b, t,
449  				!bset_written(b, bset(b, t)) &&
450  				t == bset_tree_last(b));
451  }
452  
453  /*
454   * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
455   *
456   * The first bset is going to be of similar order to the size of the node, the
457   * last bset is bounded by btree_write_set_buffer(), which is set to keep the
458   * memmove on insert from being too expensive: the middle bset should, ideally,
459   * be the geometric mean of the first and the last.
460   *
461   * Returns true if the middle bset is greater than that geometric mean:
462   */
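/*
 * (mid_u64s_bits averages the logs of the first and last bset size bounds;
 * since the mean of two logarithms is the log of the geometric mean,
 * 1U << mid_u64s_bits approximates sqrt(first * last).)
 */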
static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
464  {
465  	unsigned mid_u64s_bits =
466  		(ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;
467  
468  	return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
469  }
470  
471  /*
 * bch2_btree_init_next() - initialize a new (unwritten) bset that can then be
473   * inserted into
474   *
475   * Safe to call if there already is an unwritten bset - will only add a new bset
476   * if @b doesn't already have one.
477   *
 * If we sorted (i.e. invalidated iterators), the trans's node iterators are
 * reinitialized.
479   */
void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
481  {
482  	struct bch_fs *c = trans->c;
483  	struct btree_node_entry *bne;
484  	bool reinit_iter = false;
485  
486  	EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
487  	BUG_ON(bset_written(b, bset(b, &b->set[1])));
488  	BUG_ON(btree_node_just_written(b));
489  
490  	if (b->nsets == MAX_BSETS &&
491  	    !btree_node_write_in_flight(b) &&
492  	    should_compact_all(c, b)) {
493  		bch2_btree_node_write(c, b, SIX_LOCK_write,
494  				      BTREE_WRITE_init_next_bset);
495  		reinit_iter = true;
496  	}
497  
498  	if (b->nsets == MAX_BSETS &&
499  	    btree_node_compact(c, b))
500  		reinit_iter = true;
501  
502  	BUG_ON(b->nsets >= MAX_BSETS);
503  
504  	bne = want_new_bset(c, b);
505  	if (bne)
506  		bch2_bset_init_next(b, bne);
507  
508  	bch2_btree_build_aux_trees(b);
509  
510  	if (reinit_iter)
511  		bch2_trans_node_reinit_iter(trans, b);
512  }
513  
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
515  			  struct bch_dev *ca,
516  			  struct btree *b, struct bset *i, struct bkey_packed *k,
517  			  unsigned offset, int write)
518  {
519  	prt_printf(out, bch2_log_msg(c, "%s"),
520  		   write == READ
521  		   ? "error validating btree node "
522  		   : "corrupt btree node before write ");
523  	if (ca)
524  		prt_printf(out, "on %s ", ca->name);
525  	prt_printf(out, "at btree ");
526  	bch2_btree_pos_to_text(out, c, b);
527  
528  	printbuf_indent_add(out, 2);
529  
530  	prt_printf(out, "\nnode offset %u/%u",
531  		   b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
532  	if (i)
533  		prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
534  	if (k)
535  		prt_printf(out, " bset byte offset %lu",
536  			   (unsigned long)(void *)k -
537  			   ((unsigned long)(void *)i & ~511UL));
538  	prt_str(out, ": ");
539  }
540  
541  __printf(10, 11)
static int __btree_err(int ret,
543  		       struct bch_fs *c,
544  		       struct bch_dev *ca,
545  		       struct btree *b,
546  		       struct bset *i,
547  		       struct bkey_packed *k,
548  		       int write,
549  		       bool have_retry,
550  		       enum bch_sb_error_id err_type,
551  		       const char *fmt, ...)
552  {
553  	struct printbuf out = PRINTBUF;
554  	bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
555  	va_list args;
556  
557  	btree_err_msg(&out, c, ca, b, i, k, b->written, write);
558  
559  	va_start(args, fmt);
560  	prt_vprintf(&out, fmt, args);
561  	va_end(args);
562  
563  	if (write == WRITE) {
564  		bch2_print_string_as_lines(KERN_ERR, out.buf);
565  		ret = c->opts.errors == BCH_ON_ERROR_continue
566  			? 0
567  			: -BCH_ERR_fsck_errors_not_fixed;
568  		goto out;
569  	}
570  
571  	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
572  		ret = -BCH_ERR_btree_node_read_err_fixable;
573  	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
574  		ret = -BCH_ERR_btree_node_read_err_bad_node;
575  
576  	if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
577  		bch2_sb_error_count(c, err_type);
578  
579  	switch (ret) {
580  	case -BCH_ERR_btree_node_read_err_fixable:
581  		ret = !silent
582  			? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
583  			: -BCH_ERR_fsck_fix;
584  		if (ret != -BCH_ERR_fsck_fix &&
585  		    ret != -BCH_ERR_fsck_ignore)
586  			goto fsck_err;
587  		ret = -BCH_ERR_fsck_fix;
588  		break;
589  	case -BCH_ERR_btree_node_read_err_want_retry:
590  	case -BCH_ERR_btree_node_read_err_must_retry:
591  		if (!silent)
592  			bch2_print_string_as_lines(KERN_ERR, out.buf);
593  		break;
594  	case -BCH_ERR_btree_node_read_err_bad_node:
595  		if (!silent)
596  			bch2_print_string_as_lines(KERN_ERR, out.buf);
597  		ret = bch2_topology_error(c);
598  		break;
599  	case -BCH_ERR_btree_node_read_err_incompatible:
600  		if (!silent)
601  			bch2_print_string_as_lines(KERN_ERR, out.buf);
602  		ret = -BCH_ERR_fsck_errors_not_fixed;
603  		break;
604  	default:
605  		BUG();
606  	}
607  out:
608  fsck_err:
609  	printbuf_exit(&out);
610  	return ret;
611  }
612  
613  #define btree_err(type, c, ca, b, i, k, _err_type, msg, ...)		\
614  ({									\
615  	int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry,	\
616  			       BCH_FSCK_ERR_##_err_type,		\
617  			       msg, ##__VA_ARGS__);			\
618  									\
619  	if (_ret != -BCH_ERR_fsck_fix) {				\
620  		ret = _ret;						\
621  		goto fsck_err;						\
622  	}								\
623  									\
624  	*saw_error = true;						\
625  })
626  
627  #define btree_err_on(cond, ...)	((cond) ? btree_err(__VA_ARGS__) : false)
628  
629  /*
630   * When btree topology repair changes the start or end of a node, that might
631   * mean we have to drop keys that are no longer inside the node:
632   */
633  __cold
void bch2_btree_node_drop_keys_outside_node(struct btree *b)
635  {
636  	for_each_bset(b, t) {
637  		struct bset *i = bset(b, t);
638  		struct bkey_packed *k;
639  
640  		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
641  			if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
642  				break;
643  
644  		if (k != i->start) {
645  			unsigned shift = (u64 *) k - (u64 *) i->start;
646  
647  			memmove_u64s_down(i->start, k,
648  					  (u64 *) vstruct_end(i) - (u64 *) k);
649  			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
650  			set_btree_bset_end(b, t);
651  		}
652  
653  		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
654  			if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
655  				break;
656  
657  		if (k != vstruct_last(i)) {
658  			i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
659  			set_btree_bset_end(b, t);
660  		}
661  	}
662  
663  	/*
664  	 * Always rebuild search trees: eytzinger search tree nodes directly
665  	 * depend on the values of min/max key:
666  	 */
667  	bch2_bset_set_no_aux_tree(b, b->set);
668  	bch2_btree_build_aux_trees(b);
669  	b->nr = bch2_btree_node_count_keys(b);
670  
671  	struct bkey_s_c k;
672  	struct bkey unpacked;
673  	struct btree_node_iter iter;
674  	for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
675  		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
676  		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
677  	}
678  }
679  
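/*
 * Validate a bset header: version, size and offset within the node, and - for
 * the bset embedded in the btree node header - the node's min/max keys, level,
 * btree id and format against the node's key:
 */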
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
681  			 struct btree *b, struct bset *i,
682  			 unsigned offset, unsigned sectors,
683  			 int write, bool have_retry, bool *saw_error)
684  {
685  	unsigned version = le16_to_cpu(i->version);
686  	unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
687  	struct printbuf buf1 = PRINTBUF;
688  	struct printbuf buf2 = PRINTBUF;
689  	int ret = 0;
690  
691  	btree_err_on(!bch2_version_compatible(version),
692  		     -BCH_ERR_btree_node_read_err_incompatible,
693  		     c, ca, b, i, NULL,
694  		     btree_node_unsupported_version,
695  		     "unsupported bset version %u.%u",
696  		     BCH_VERSION_MAJOR(version),
697  		     BCH_VERSION_MINOR(version));
698  
699  	if (btree_err_on(version < c->sb.version_min,
700  			 -BCH_ERR_btree_node_read_err_fixable,
701  			 c, NULL, b, i, NULL,
702  			 btree_node_bset_older_than_sb_min,
703  			 "bset version %u older than superblock version_min %u",
704  			 version, c->sb.version_min)) {
705  		mutex_lock(&c->sb_lock);
706  		c->disk_sb.sb->version_min = cpu_to_le16(version);
707  		bch2_write_super(c);
708  		mutex_unlock(&c->sb_lock);
709  	}
710  
711  	if (btree_err_on(BCH_VERSION_MAJOR(version) >
712  			 BCH_VERSION_MAJOR(c->sb.version),
713  			 -BCH_ERR_btree_node_read_err_fixable,
714  			 c, NULL, b, i, NULL,
715  			 btree_node_bset_newer_than_sb,
716  			 "bset version %u newer than superblock version %u",
717  			 version, c->sb.version)) {
718  		mutex_lock(&c->sb_lock);
719  		c->disk_sb.sb->version = cpu_to_le16(version);
720  		bch2_write_super(c);
721  		mutex_unlock(&c->sb_lock);
722  	}
723  
724  	btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
725  		     -BCH_ERR_btree_node_read_err_incompatible,
726  		     c, ca, b, i, NULL,
727  		     btree_node_unsupported_version,
728  		     "BSET_SEPARATE_WHITEOUTS no longer supported");
729  
730  	if (!write &&
731  	    btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)),
732  			 -BCH_ERR_btree_node_read_err_fixable,
733  			 c, ca, b, i, NULL,
734  			 bset_past_end_of_btree_node,
735  			 "bset past end of btree node (offset %u len %u but written %zu)",
736  			 offset, sectors, ptr_written ?: btree_sectors(c)))
737  		i->u64s = 0;
738  
739  	btree_err_on(offset && !i->u64s,
740  		     -BCH_ERR_btree_node_read_err_fixable,
741  		     c, ca, b, i, NULL,
742  		     bset_empty,
743  		     "empty bset");
744  
745  	btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
746  		     -BCH_ERR_btree_node_read_err_want_retry,
747  		     c, ca, b, i, NULL,
748  		     bset_wrong_sector_offset,
749  		     "bset at wrong sector offset");
750  
751  	if (!offset) {
752  		struct btree_node *bn =
753  			container_of(i, struct btree_node, keys);
754  		/* These indicate that we read the wrong btree node: */
755  
756  		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
757  			struct bch_btree_ptr_v2 *bp =
758  				&bkey_i_to_btree_ptr_v2(&b->key)->v;
759  
760  			/* XXX endianness */
761  			btree_err_on(bp->seq != bn->keys.seq,
762  				     -BCH_ERR_btree_node_read_err_must_retry,
763  				     c, ca, b, NULL, NULL,
764  				     bset_bad_seq,
765  				     "incorrect sequence number (wrong btree node)");
766  		}
767  
768  		btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
769  			     -BCH_ERR_btree_node_read_err_must_retry,
770  			     c, ca, b, i, NULL,
771  			     btree_node_bad_btree,
772  			     "incorrect btree id");
773  
774  		btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
775  			     -BCH_ERR_btree_node_read_err_must_retry,
776  			     c, ca, b, i, NULL,
777  			     btree_node_bad_level,
778  			     "incorrect level");
779  
780  		if (!write)
781  			compat_btree_node(b->c.level, b->c.btree_id, version,
782  					  BSET_BIG_ENDIAN(i), write, bn);
783  
784  		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
785  			struct bch_btree_ptr_v2 *bp =
786  				&bkey_i_to_btree_ptr_v2(&b->key)->v;
787  
788  			if (BTREE_PTR_RANGE_UPDATED(bp)) {
789  				b->data->min_key = bp->min_key;
790  				b->data->max_key = b->key.k.p;
791  			}
792  
793  			btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
794  				     -BCH_ERR_btree_node_read_err_must_retry,
795  				     c, ca, b, NULL, NULL,
796  				     btree_node_bad_min_key,
797  				     "incorrect min_key: got %s should be %s",
798  				     (printbuf_reset(&buf1),
799  				      bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
800  				     (printbuf_reset(&buf2),
801  				      bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
802  		}
803  
804  		btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
805  			     -BCH_ERR_btree_node_read_err_must_retry,
806  			     c, ca, b, i, NULL,
807  			     btree_node_bad_max_key,
808  			     "incorrect max key %s",
809  			     (printbuf_reset(&buf1),
810  			      bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
811  
812  		if (write)
813  			compat_btree_node(b->c.level, b->c.btree_id, version,
814  					  BSET_BIG_ENDIAN(i), write, bn);
815  
816  		btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
817  			     -BCH_ERR_btree_node_read_err_bad_node,
818  			     c, ca, b, i, NULL,
819  			     btree_node_bad_format,
820  			     "invalid bkey format: %s\n  %s", buf1.buf,
821  			     (printbuf_reset(&buf2),
822  			      bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
823  		printbuf_reset(&buf1);
824  
825  		compat_bformat(b->c.level, b->c.btree_id, version,
826  			       BSET_BIG_ENDIAN(i), write,
827  			       &bn->format);
828  	}
829  fsck_err:
830  	printbuf_exit(&buf2);
831  	printbuf_exit(&buf1);
832  	return ret;
833  }
834  
static int bset_key_validate(struct bch_fs *c, struct btree *b,
836  			     struct bkey_s_c k,
837  			     bool updated_range, int rw)
838  {
839  	return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?:
840  		(!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?:
841  		(rw == WRITE ? bch2_bkey_val_validate(c, k, 0) : 0);
842  }
843  
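/*
 * Can this bkey be trusted enough to unpack and validate? Used when scanning
 * forward for the next good key after dropping a bad one:
 */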
static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
845  			 struct bset *i, struct bkey_packed *k)
846  {
847  	if (bkey_p_next(k) > vstruct_last(i))
848  		return false;
849  
850  	if (k->format > KEY_FORMAT_CURRENT)
851  		return false;
852  
853  	if (!bkeyp_u64s_valid(&b->format, k))
854  		return false;
855  
856  	struct bkey tmp;
857  	struct bkey_s u = __bkey_disassemble(b, k, &tmp);
858  	return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent);
859  }
860  
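/*
 * Validate the keys in a bset: bad keys are dropped individually; if we can't
 * find the start of the next good key, the rest of the bset is truncated:
 */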
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
862  			 struct bset *i, int write,
863  			 bool have_retry, bool *saw_error)
864  {
865  	unsigned version = le16_to_cpu(i->version);
866  	struct bkey_packed *k, *prev = NULL;
867  	struct printbuf buf = PRINTBUF;
868  	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
869  		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
870  	int ret = 0;
871  
872  	for (k = i->start;
873  	     k != vstruct_last(i);) {
874  		struct bkey_s u;
875  		struct bkey tmp;
876  		unsigned next_good_key;
877  
878  		if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
879  				 -BCH_ERR_btree_node_read_err_fixable,
880  				 c, NULL, b, i, k,
881  				 btree_node_bkey_past_bset_end,
882  				 "key extends past end of bset")) {
883  			i->u64s = cpu_to_le16((u64 *) k - i->_data);
884  			break;
885  		}
886  
887  		if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
888  				 -BCH_ERR_btree_node_read_err_fixable,
889  				 c, NULL, b, i, k,
890  				 btree_node_bkey_bad_format,
891  				 "invalid bkey format %u", k->format))
892  			goto drop_this_key;
893  
894  		if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
895  				 -BCH_ERR_btree_node_read_err_fixable,
896  				 c, NULL, b, i, k,
897  				 btree_node_bkey_bad_u64s,
898  				 "bad k->u64s %u (min %u max %zu)", k->u64s,
899  				 bkeyp_key_u64s(&b->format, k),
900  				 U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
901  			goto drop_this_key;
902  
903  		if (!write)
904  			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
905  				    BSET_BIG_ENDIAN(i), write,
906  				    &b->format, k);
907  
908  		u = __bkey_disassemble(b, k, &tmp);
909  
910  		ret = bset_key_validate(c, b, u.s_c, updated_range, write);
911  		if (ret == -BCH_ERR_fsck_delete_bkey)
912  			goto drop_this_key;
913  		if (ret)
914  			goto fsck_err;
915  
916  		if (write)
917  			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
918  				    BSET_BIG_ENDIAN(i), write,
919  				    &b->format, k);
920  
921  		if (prev && bkey_iter_cmp(b, prev, k) > 0) {
922  			struct bkey up = bkey_unpack_key(b, prev);
923  
924  			printbuf_reset(&buf);
925  			prt_printf(&buf, "keys out of order: ");
926  			bch2_bkey_to_text(&buf, &up);
927  			prt_printf(&buf, " > ");
928  			bch2_bkey_to_text(&buf, u.k);
929  
930  			if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
931  				      c, NULL, b, i, k,
932  				      btree_node_bkey_out_of_order,
933  				      "%s", buf.buf))
934  				goto drop_this_key;
935  		}
936  
937  		prev = k;
938  		k = bkey_p_next(k);
939  		continue;
940  drop_this_key:
941  		next_good_key = k->u64s;
942  
943  		if (!next_good_key ||
944  		    (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
945  		     version >= bcachefs_metadata_version_snapshot)) {
946  			/*
947  			 * only do scanning if bch2_bkey_compat() has nothing to
948  			 * do
949  			 */
950  
951  			if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
952  				for (next_good_key = 1;
953  				     next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
954  				     next_good_key++)
955  					if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
956  						goto got_good_key;
957  			}
958  
959  			/*
960  			 * didn't find a good key, have to truncate the rest of
961  			 * the bset
962  			 */
963  			next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
964  		}
965  got_good_key:
966  		le16_add_cpu(&i->u64s, -next_good_key);
967  		memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
968  	}
969  fsck_err:
970  	printbuf_exit(&buf);
971  	return ret;
972  }
973  
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
975  			      struct btree *b, bool have_retry, bool *saw_error)
976  {
977  	struct btree_node_entry *bne;
978  	struct sort_iter *iter;
979  	struct btree_node *sorted;
980  	struct bkey_packed *k;
981  	struct bset *i;
982  	bool used_mempool, blacklisted;
983  	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
984  		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
985  	unsigned u64s;
986  	unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
987  	u64 max_journal_seq = 0;
988  	struct printbuf buf = PRINTBUF;
989  	int ret = 0, retry_read = 0, write = READ;
990  	u64 start_time = local_clock();
991  
992  	b->version_ondisk = U16_MAX;
993  	/* We might get called multiple times on read retry: */
994  	b->written = 0;
995  
996  	iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
997  	sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
998  
999  	if (bch2_meta_read_fault("btree"))
1000  		btree_err(-BCH_ERR_btree_node_read_err_must_retry,
1001  			  c, ca, b, NULL, NULL,
1002  			  btree_node_fault_injected,
1003  			  "dynamic fault");
1004  
1005  	btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
1006  		     -BCH_ERR_btree_node_read_err_must_retry,
1007  		     c, ca, b, NULL, NULL,
1008  		     btree_node_bad_magic,
1009  		     "bad magic: want %llx, got %llx",
1010  		     bset_magic(c), le64_to_cpu(b->data->magic));
1011  
1012  	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
1013  		struct bch_btree_ptr_v2 *bp =
1014  			&bkey_i_to_btree_ptr_v2(&b->key)->v;
1015  
1016  		bch2_bpos_to_text(&buf, b->data->min_key);
1017  		prt_str(&buf, "-");
1018  		bch2_bpos_to_text(&buf, b->data->max_key);
1019  
1020  		btree_err_on(b->data->keys.seq != bp->seq,
1021  			     -BCH_ERR_btree_node_read_err_must_retry,
1022  			     c, ca, b, NULL, NULL,
1023  			     btree_node_bad_seq,
1024  			     "got wrong btree node: got\n%s",
1025  			     (printbuf_reset(&buf),
1026  			      bch2_btree_node_header_to_text(&buf, b->data),
1027  			      buf.buf));
1028  	} else {
1029  		btree_err_on(!b->data->keys.seq,
1030  			     -BCH_ERR_btree_node_read_err_must_retry,
1031  			     c, ca, b, NULL, NULL,
1032  			     btree_node_bad_seq,
1033  			     "bad btree header: seq 0\n%s",
1034  			     (printbuf_reset(&buf),
1035  			      bch2_btree_node_header_to_text(&buf, b->data),
1036  			      buf.buf));
1037  	}
1038  
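	/*
	 * Each pass through this loop validates one bset: the first lives in
	 * the btree node header, subsequent ones are btree_node_entries.
	 * Verify the checksum, decrypt if necessary, then validate the bset
	 * header and its keys:
	 */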
1039  	while (b->written < (ptr_written ?: btree_sectors(c))) {
1040  		unsigned sectors;
1041  		struct nonce nonce;
1042  		bool first = !b->written;
1043  		bool csum_bad;
1044  
1045  		if (!b->written) {
1046  			i = &b->data->keys;
1047  
1048  			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
1049  				     -BCH_ERR_btree_node_read_err_want_retry,
1050  				     c, ca, b, i, NULL,
1051  				     bset_unknown_csum,
1052  				     "unknown checksum type %llu", BSET_CSUM_TYPE(i));
1053  
1054  			nonce = btree_nonce(i, b->written << 9);
1055  
1056  			struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
1057  			csum_bad = bch2_crc_cmp(b->data->csum, csum);
1058  			if (csum_bad)
1059  				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
1060  
1061  			btree_err_on(csum_bad,
1062  				     -BCH_ERR_btree_node_read_err_want_retry,
1063  				     c, ca, b, i, NULL,
1064  				     bset_bad_csum,
1065  				     "%s",
1066  				     (printbuf_reset(&buf),
1067  				      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
1068  				      buf.buf));
1069  
1070  			ret = bset_encrypt(c, i, b->written << 9);
1071  			if (bch2_fs_fatal_err_on(ret, c,
1072  					"decrypting btree node: %s", bch2_err_str(ret)))
1073  				goto fsck_err;
1074  
1075  			btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
1076  				     !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
1077  				     -BCH_ERR_btree_node_read_err_incompatible,
1078  				     c, NULL, b, NULL, NULL,
1079  				     btree_node_unsupported_version,
1080  				     "btree node does not have NEW_EXTENT_OVERWRITE set");
1081  
1082  			sectors = vstruct_sectors(b->data, c->block_bits);
1083  		} else {
1084  			bne = write_block(b);
1085  			i = &bne->keys;
1086  
1087  			if (i->seq != b->data->keys.seq)
1088  				break;
1089  
1090  			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
1091  				     -BCH_ERR_btree_node_read_err_want_retry,
1092  				     c, ca, b, i, NULL,
1093  				     bset_unknown_csum,
1094  				     "unknown checksum type %llu", BSET_CSUM_TYPE(i));
1095  
1096  			nonce = btree_nonce(i, b->written << 9);
1097  			struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
1098  			csum_bad = bch2_crc_cmp(bne->csum, csum);
1099  			if (ca && csum_bad)
1100  				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
1101  
1102  			btree_err_on(csum_bad,
1103  				     -BCH_ERR_btree_node_read_err_want_retry,
1104  				     c, ca, b, i, NULL,
1105  				     bset_bad_csum,
1106  				     "%s",
1107  				     (printbuf_reset(&buf),
1108  				      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
1109  				      buf.buf));
1110  
1111  			ret = bset_encrypt(c, i, b->written << 9);
1112  			if (bch2_fs_fatal_err_on(ret, c,
1113  					"decrypting btree node: %s", bch2_err_str(ret)))
1114  				goto fsck_err;
1115  
1116  			sectors = vstruct_sectors(bne, c->block_bits);
1117  		}
1118  
1119  		b->version_ondisk = min(b->version_ondisk,
1120  					le16_to_cpu(i->version));
1121  
1122  		ret = validate_bset(c, ca, b, i, b->written, sectors,
1123  				    READ, have_retry, saw_error);
1124  		if (ret)
1125  			goto fsck_err;
1126  
1127  		if (!b->written)
1128  			btree_node_set_format(b, b->data->format);
1129  
1130  		ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
1131  		if (ret)
1132  			goto fsck_err;
1133  
1134  		SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
1135  
1136  		blacklisted = bch2_journal_seq_is_blacklisted(c,
1137  					le64_to_cpu(i->journal_seq),
1138  					true);
1139  
1140  		btree_err_on(blacklisted && first,
1141  			     -BCH_ERR_btree_node_read_err_fixable,
1142  			     c, ca, b, i, NULL,
1143  			     bset_blacklisted_journal_seq,
1144  			     "first btree node bset has blacklisted journal seq (%llu)",
1145  			     le64_to_cpu(i->journal_seq));
1146  
1147  		btree_err_on(blacklisted && ptr_written,
1148  			     -BCH_ERR_btree_node_read_err_fixable,
1149  			     c, ca, b, i, NULL,
1150  			     first_bset_blacklisted_journal_seq,
1151  			     "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
1152  			     le64_to_cpu(i->journal_seq),
1153  			     b->written, b->written + sectors, ptr_written);
1154  
1155  		b->written += sectors;
1156  
1157  		if (blacklisted && !first)
1158  			continue;
1159  
1160  		sort_iter_add(iter,
1161  			      vstruct_idx(i, 0),
1162  			      vstruct_last(i));
1163  
1164  		max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq));
1165  	}
1166  
1167  	if (ptr_written) {
1168  		btree_err_on(b->written < ptr_written,
1169  			     -BCH_ERR_btree_node_read_err_want_retry,
1170  			     c, ca, b, NULL, NULL,
1171  			     btree_node_data_missing,
1172  			     "btree node data missing: expected %u sectors, found %u",
1173  			     ptr_written, b->written);
1174  	} else {
1175  		for (bne = write_block(b);
1176  		     bset_byte_offset(b, bne) < btree_buf_bytes(b);
1177  		     bne = (void *) bne + block_bytes(c))
1178  			btree_err_on(bne->keys.seq == b->data->keys.seq &&
1179  				     !bch2_journal_seq_is_blacklisted(c,
1180  								      le64_to_cpu(bne->keys.journal_seq),
1181  								      true),
1182  				     -BCH_ERR_btree_node_read_err_want_retry,
1183  				     c, ca, b, NULL, NULL,
1184  				     btree_node_bset_after_end,
1185  				     "found bset signature after last bset");
1186  	}
1187  
1188  	sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
1189  	sorted->keys.u64s = 0;
1190  
1191  	set_btree_bset(b, b->set, &b->data->keys);
1192  
1193  	b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
1194  	memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
1195  			btree_buf_bytes(b) -
1196  			sizeof(struct btree_node) -
1197  			b->nr.live_u64s * sizeof(u64));
1198  
1199  	u64s = le16_to_cpu(sorted->keys.u64s);
1200  	*sorted = *b->data;
1201  	sorted->keys.u64s = cpu_to_le16(u64s);
1202  	swap(sorted, b->data);
1203  	set_btree_bset(b, b->set, &b->data->keys);
1204  	b->nsets = 1;
1205  	b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
1206  
1207  	BUG_ON(b->nr.live_u64s != u64s);
1208  
1209  	btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
1210  
1211  	if (updated_range)
1212  		bch2_btree_node_drop_keys_outside_node(b);
1213  
1214  	i = &b->data->keys;
1215  	for (k = i->start; k != vstruct_last(i);) {
1216  		struct bkey tmp;
1217  		struct bkey_s u = __bkey_disassemble(b, k, &tmp);
1218  
1219  		ret = bch2_bkey_val_validate(c, u.s_c, READ);
1220  		if (ret == -BCH_ERR_fsck_delete_bkey ||
1221  		    (bch2_inject_invalid_keys &&
1222  		     !bversion_cmp(u.k->bversion, MAX_VERSION))) {
1223  			btree_keys_account_key_drop(&b->nr, 0, k);
1224  
1225  			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
1226  			memmove_u64s_down(k, bkey_p_next(k),
1227  					  (u64 *) vstruct_end(i) - (u64 *) k);
1228  			set_btree_bset_end(b, b->set);
1229  			continue;
1230  		}
1231  		if (ret)
1232  			goto fsck_err;
1233  
1234  		if (u.k->type == KEY_TYPE_btree_ptr_v2) {
1235  			struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
1236  
1237  			bp.v->mem_ptr = 0;
1238  		}
1239  
1240  		k = bkey_p_next(k);
1241  	}
1242  
1243  	bch2_bset_build_aux_tree(b, b->set, false);
1244  
1245  	set_needs_whiteout(btree_bset_first(b), true);
1246  
1247  	btree_node_reset_sib_u64s(b);
1248  
1249  	rcu_read_lock();
1250  	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
1251  		struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
1252  
1253  		if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
1254  			set_btree_node_need_rewrite(b);
1255  	}
1256  	rcu_read_unlock();
1257  
1258  	if (!ptr_written)
1259  		set_btree_node_need_rewrite(b);
1260  out:
1261  	mempool_free(iter, &c->fill_iter);
1262  	printbuf_exit(&buf);
1263  	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
1264  	return retry_read;
1265  fsck_err:
1266  	if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
1267  	    ret == -BCH_ERR_btree_node_read_err_must_retry) {
1268  		retry_read = 1;
1269  	} else {
1270  		set_btree_node_read_error(b);
1271  		bch2_btree_lost_data(c, b->c.btree_id);
1272  	}
1273  	goto out;
1274  }
1275  
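/*
 * Read completion: check for IO and validation errors, and if we have other
 * replicas to read from keep retrying from a different device until the read
 * succeeds or we run out of replicas:
 */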
static void btree_node_read_work(struct work_struct *work)
1277  {
1278  	struct btree_read_bio *rb =
1279  		container_of(work, struct btree_read_bio, work);
1280  	struct bch_fs *c	= rb->c;
1281  	struct bch_dev *ca	= rb->have_ioref ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
1282  	struct btree *b		= rb->b;
1283  	struct bio *bio		= &rb->bio;
1284  	struct bch_io_failures failed = { .nr = 0 };
1285  	struct printbuf buf = PRINTBUF;
1286  	bool saw_error = false;
1287  	bool retry = false;
1288  	bool can_retry;
1289  
1290  	goto start;
1291  	while (1) {
1292  		retry = true;
1293  		bch_info(c, "retrying read");
1294  		ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
1295  		rb->have_ioref		= ca != NULL;
1296  		bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
1297  		bio->bi_iter.bi_sector	= rb->pick.ptr.offset;
1298  		bio->bi_iter.bi_size	= btree_buf_bytes(b);
1299  
1300  		if (rb->have_ioref) {
1301  			bio_set_dev(bio, ca->disk_sb.bdev);
1302  			submit_bio_wait(bio);
1303  		} else {
1304  			bio->bi_status = BLK_STS_REMOVED;
1305  		}
1306  start:
1307  		printbuf_reset(&buf);
1308  		bch2_btree_pos_to_text(&buf, c, b);
1309  		bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
1310  				   "btree read error %s for %s",
1311  				   bch2_blk_status_to_str(bio->bi_status), buf.buf);
1312  		if (rb->have_ioref)
1313  			percpu_ref_put(&ca->io_ref);
1314  		rb->have_ioref = false;
1315  
1316  		bch2_mark_io_failure(&failed, &rb->pick);
1317  
1318  		can_retry = bch2_bkey_pick_read_device(c,
1319  				bkey_i_to_s_c(&b->key),
1320  				&failed, &rb->pick) > 0;
1321  
1322  		if (!bio->bi_status &&
1323  		    !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
1324  			if (retry)
1325  				bch_info(c, "retry success");
1326  			break;
1327  		}
1328  
1329  		saw_error = true;
1330  
1331  		if (!can_retry) {
1332  			set_btree_node_read_error(b);
1333  			bch2_btree_lost_data(c, b->c.btree_id);
1334  			break;
1335  		}
1336  	}
1337  
1338  	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
1339  			       rb->start_time);
1340  	bio_put(&rb->bio);
1341  
1342  	if (saw_error &&
1343  	    !btree_node_read_error(b) &&
1344  	    c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
1345  		printbuf_reset(&buf);
1346  		bch2_bpos_to_text(&buf, b->key.k.p);
1347  		bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
1348  			 __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
1349  
1350  		bch2_btree_node_rewrite_async(c, b);
1351  	}
1352  
1353  	printbuf_exit(&buf);
1354  	clear_btree_node_read_in_flight(b);
1355  	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
1356  }
1357  
static void btree_node_read_endio(struct bio *bio)
1359  {
1360  	struct btree_read_bio *rb =
1361  		container_of(bio, struct btree_read_bio, bio);
1362  	struct bch_fs *c	= rb->c;
1363  
1364  	if (rb->have_ioref) {
1365  		struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
1366  
1367  		bch2_latency_acct(ca, rb->start_time, READ);
1368  	}
1369  
1370  	queue_work(c->btree_read_complete_wq, &rb->work);
1371  }
1372  
1373  struct btree_node_read_all {
1374  	struct closure		cl;
1375  	struct bch_fs		*c;
1376  	struct btree		*b;
1377  	unsigned		nr;
1378  	void			*buf[BCH_REPLICAS_MAX];
1379  	struct bio		*bio[BCH_REPLICAS_MAX];
1380  	blk_status_t		err[BCH_REPLICAS_MAX];
1381  };
1382  
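/* Walk the bsets in an on disk btree node, returning the number of sectors written: */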
static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
1384  {
1385  	struct btree_node *bn = data;
1386  	struct btree_node_entry *bne;
1387  	unsigned offset = 0;
1388  
	if (le64_to_cpu(bn->magic) != bset_magic(c))
1390  		return 0;
1391  
1392  	while (offset < btree_sectors(c)) {
1393  		if (!offset) {
1394  			offset += vstruct_sectors(bn, c->block_bits);
1395  		} else {
1396  			bne = data + (offset << 9);
1397  			if (bne->keys.seq != bn->keys.seq)
1398  				break;
1399  			offset += vstruct_sectors(bne, c->block_bits);
1400  		}
1401  	}
1402  
1403  	return offset;
1404  }
1405  
static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
1407  {
1408  	struct btree_node *bn = data;
1409  	struct btree_node_entry *bne;
1410  
1411  	if (!offset)
1412  		return false;
1413  
1414  	while (offset < btree_sectors(c)) {
1415  		bne = data + (offset << 9);
1416  		if (bne->keys.seq == bn->keys.seq)
1417  			return true;
1418  		offset++;
1419  	}
1420  
1421  	return false;
1423  }
1424  
static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
1426  {
1427  	closure_type(ra, struct btree_node_read_all, cl);
1428  	struct bch_fs *c = ra->c;
1429  	struct btree *b = ra->b;
1430  	struct printbuf buf = PRINTBUF;
1431  	bool dump_bset_maps = false;
1432  	bool have_retry = false;
1433  	int ret = 0, best = -1, write = READ;
1434  	unsigned i, written = 0, written2 = 0;
1435  	__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
1436  		? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
1437  	bool _saw_error = false, *saw_error = &_saw_error;
1438  
1439  	for (i = 0; i < ra->nr; i++) {
1440  		struct btree_node *bn = ra->buf[i];
1441  
1442  		if (ra->err[i])
1443  			continue;
1444  
1445  		if (le64_to_cpu(bn->magic) != bset_magic(c) ||
1446  		    (seq && seq != bn->keys.seq))
1447  			continue;
1448  
1449  		if (best < 0) {
1450  			best = i;
1451  			written = btree_node_sectors_written(c, bn);
1452  			continue;
1453  		}
1454  
1455  		written2 = btree_node_sectors_written(c, ra->buf[i]);
1456  		if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
1457  				 c, NULL, b, NULL, NULL,
1458  				 btree_node_replicas_sectors_written_mismatch,
1459  				 "btree node sectors written mismatch: %u != %u",
1460  				 written, written2) ||
1461  		    btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
1462  				 -BCH_ERR_btree_node_read_err_fixable,
1463  				 c, NULL, b, NULL, NULL,
1464  				 btree_node_bset_after_end,
1465  				 "found bset signature after last bset") ||
1466  		    btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
1467  				 -BCH_ERR_btree_node_read_err_fixable,
1468  				 c, NULL, b, NULL, NULL,
1469  				 btree_node_replicas_data_mismatch,
1470  				 "btree node replicas content mismatch"))
1471  			dump_bset_maps = true;
1472  
1473  		if (written2 > written) {
1474  			written = written2;
1475  			best = i;
1476  		}
1477  	}
1478  fsck_err:
1479  	if (dump_bset_maps) {
1480  		for (i = 0; i < ra->nr; i++) {
1481  			struct btree_node *bn = ra->buf[i];
1482  			struct btree_node_entry *bne = NULL;
1483  			unsigned offset = 0, sectors;
1484  			bool gap = false;
1485  
1486  			if (ra->err[i])
1487  				continue;
1488  
1489  			printbuf_reset(&buf);
1490  
1491  			while (offset < btree_sectors(c)) {
1492  				if (!offset) {
1493  					sectors = vstruct_sectors(bn, c->block_bits);
1494  				} else {
1495  					bne = ra->buf[i] + (offset << 9);
1496  					if (bne->keys.seq != bn->keys.seq)
1497  						break;
1498  					sectors = vstruct_sectors(bne, c->block_bits);
1499  				}
1500  
1501  				prt_printf(&buf, " %u-%u", offset, offset + sectors);
1502  				if (bne && bch2_journal_seq_is_blacklisted(c,
1503  							le64_to_cpu(bne->keys.journal_seq), false))
1504  					prt_printf(&buf, "*");
1505  				offset += sectors;
1506  			}
1507  
1508  			while (offset < btree_sectors(c)) {
1509  				bne = ra->buf[i] + (offset << 9);
1510  				if (bne->keys.seq == bn->keys.seq) {
1511  					if (!gap)
1512  						prt_printf(&buf, " GAP");
1513  					gap = true;
1514  
1515  					sectors = vstruct_sectors(bne, c->block_bits);
1516  					prt_printf(&buf, " %u-%u", offset, offset + sectors);
1517  					if (bch2_journal_seq_is_blacklisted(c,
1518  							le64_to_cpu(bne->keys.journal_seq), false))
1519  						prt_printf(&buf, "*");
1520  				}
1521  				offset++;
1522  			}
1523  
1524  			bch_err(c, "replica %u:%s", i, buf.buf);
1525  		}
1526  	}
1527  
1528  	if (best >= 0) {
1529  		memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
1530  		ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
1531  	} else {
1532  		ret = -1;
1533  	}
1534  
1535  	if (ret) {
1536  		set_btree_node_read_error(b);
1537  		bch2_btree_lost_data(c, b->c.btree_id);
1538  	} else if (*saw_error)
1539  		bch2_btree_node_rewrite_async(c, b);
1540  
1541  	for (i = 0; i < ra->nr; i++) {
1542  		mempool_free(ra->buf[i], &c->btree_bounce_pool);
1543  		bio_put(ra->bio[i]);
1544  	}
1545  
1546  	closure_debug_destroy(&ra->cl);
1547  	kfree(ra);
1548  	printbuf_exit(&buf);
1549  
1550  	clear_btree_node_read_in_flight(b);
1551  	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
1552  }
1553  
static void btree_node_read_all_replicas_endio(struct bio *bio)
1555  {
1556  	struct btree_read_bio *rb =
1557  		container_of(bio, struct btree_read_bio, bio);
1558  	struct bch_fs *c	= rb->c;
1559  	struct btree_node_read_all *ra = rb->ra;
1560  
1561  	if (rb->have_ioref) {
1562  		struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
1563  
1564  		bch2_latency_acct(ca, rb->start_time, READ);
1565  	}
1566  
1567  	ra->err[rb->idx] = bio->bi_status;
1568  	closure_put(&ra->cl);
1569  }
1570  
1571  /*
1572   * XXX This allocates multiple times from the same mempools, and can deadlock
1573   * under sufficient memory pressure (but is only a debug path)
1574   */
static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
1576  {
1577  	struct bkey_s_c k = bkey_i_to_s_c(&b->key);
1578  	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1579  	const union bch_extent_entry *entry;
1580  	struct extent_ptr_decoded pick;
1581  	struct btree_node_read_all *ra;
1582  	unsigned i;
1583  
1584  	ra = kzalloc(sizeof(*ra), GFP_NOFS);
1585  	if (!ra)
1586  		return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;
1587  
1588  	closure_init(&ra->cl, NULL);
1589  	ra->c	= c;
1590  	ra->b	= b;
1591  	ra->nr	= bch2_bkey_nr_ptrs(k);
1592  
1593  	for (i = 0; i < ra->nr; i++) {
1594  		ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
1595  		ra->bio[i] = bio_alloc_bioset(NULL,
1596  					      buf_pages(ra->buf[i], btree_buf_bytes(b)),
1597  					      REQ_OP_READ|REQ_SYNC|REQ_META,
1598  					      GFP_NOFS,
1599  					      &c->btree_bio);
1600  	}
1601  
1602  	i = 0;
1603  	bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
1604  		struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
1605  		struct btree_read_bio *rb =
1606  			container_of(ra->bio[i], struct btree_read_bio, bio);
1607  		rb->c			= c;
1608  		rb->b			= b;
1609  		rb->ra			= ra;
1610  		rb->start_time		= local_clock();
1611  		rb->have_ioref		= ca != NULL;
1612  		rb->idx			= i;
1613  		rb->pick		= pick;
1614  		rb->bio.bi_iter.bi_sector = pick.ptr.offset;
1615  		rb->bio.bi_end_io	= btree_node_read_all_replicas_endio;
1616  		bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
1617  
1618  		if (rb->have_ioref) {
1619  			this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
1620  				     bio_sectors(&rb->bio));
1621  			bio_set_dev(&rb->bio, ca->disk_sb.bdev);
1622  
1623  			closure_get(&ra->cl);
1624  			submit_bio(&rb->bio);
1625  		} else {
1626  			ra->err[i] = BLK_STS_REMOVED;
1627  		}
1628  
1629  		i++;
1630  	}
1631  
1632  	if (sync) {
1633  		closure_sync(&ra->cl);
1634  		btree_node_read_all_replicas_done(&ra->cl.work);
1635  	} else {
1636  		continue_at(&ra->cl, btree_node_read_all_replicas_done,
1637  			    c->btree_read_complete_wq);
1638  	}
1639  
1640  	return 0;
1641  }
1642  
void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
1644  			  bool sync)
1645  {
1646  	struct bch_fs *c = trans->c;
1647  	struct extent_ptr_decoded pick;
1648  	struct btree_read_bio *rb;
1649  	struct bch_dev *ca;
1650  	struct bio *bio;
1651  	int ret;
1652  
1653  	trace_and_count(c, btree_node_read, trans, b);
1654  
1655  	if (bch2_verify_all_btree_replicas &&
1656  	    !btree_node_read_all_replicas(c, b, sync))
1657  		return;
1658  
1659  	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
1660  					 NULL, &pick);
1661  
1662  	if (ret <= 0) {
1663  		struct printbuf buf = PRINTBUF;
1664  
1665  		prt_str(&buf, "btree node read error: no device to read from\n at ");
1666  		bch2_btree_pos_to_text(&buf, c, b);
1667  		bch_err_ratelimited(c, "%s", buf.buf);
1668  
1669  		if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
1670  		    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
1671  			bch2_fatal_error(c);
1672  
1673  		set_btree_node_read_error(b);
1674  		bch2_btree_lost_data(c, b->c.btree_id);
1675  		clear_btree_node_read_in_flight(b);
1676  		wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
1677  		printbuf_exit(&buf);
1678  		return;
1679  	}
1680  
1681  	ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
1682  
1683  	bio = bio_alloc_bioset(NULL,
1684  			       buf_pages(b->data, btree_buf_bytes(b)),
1685  			       REQ_OP_READ|REQ_SYNC|REQ_META,
1686  			       GFP_NOFS,
1687  			       &c->btree_bio);
1688  	rb = container_of(bio, struct btree_read_bio, bio);
1689  	rb->c			= c;
1690  	rb->b			= b;
1691  	rb->ra			= NULL;
1692  	rb->start_time		= local_clock();
1693  	rb->have_ioref		= ca != NULL;
1694  	rb->pick		= pick;
1695  	INIT_WORK(&rb->work, btree_node_read_work);
1696  	bio->bi_iter.bi_sector	= pick.ptr.offset;
1697  	bio->bi_end_io		= btree_node_read_endio;
1698  	bch2_bio_map(bio, b->data, btree_buf_bytes(b));
1699  
1700  	if (rb->have_ioref) {
1701  		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
1702  			     bio_sectors(bio));
1703  		bio_set_dev(bio, ca->disk_sb.bdev);
1704  
1705  		if (sync) {
1706  			submit_bio_wait(bio);
1707  			bch2_latency_acct(ca, rb->start_time, READ);
1708  			btree_node_read_work(&rb->work);
1709  		} else {
1710  			submit_bio(bio);
1711  		}
1712  	} else {
1713  		bio->bi_status = BLK_STS_REMOVED;
1714  
1715  		if (sync)
1716  			btree_node_read_work(&rb->work);
1717  		else
1718  			queue_work(c->btree_read_complete_wq, &rb->work);
1719  	}
1720  }
1721  
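/*
 * Read a btree root: allocate a node in the btree node cache (cannibalizing
 * another node if allocation fails), read it synchronously, then set it as
 * the root for this btree.
 */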
static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
				  const struct bkey_i *k, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	do {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(trans, level != 0);
	bch2_btree_cache_cannibalize_unlock(trans);

	BUG_ON(IS_ERR(b));

	bkey_copy(&b->key, k);
	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));

	set_btree_node_read_in_flight(b);

	/* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */
	bch2_trans_unlock(trans);
	bch2_btree_node_read(trans, b, true);

	if (btree_node_read_error(b)) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, b);
		mutex_unlock(&c->btree_cache.lock);

		ret = -BCH_ERR_btree_node_read_error;
		goto err;
	}

	bch2_btree_set_root_for_read(c, b);
err:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	return ret;
}

int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
			const struct bkey_i *k, unsigned level)
{
	return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
}

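/*
 * A write to this node has completed: drop the will_make_reachable reference
 * (putting the btree_update's closure if one was held) and drop our pin on
 * the journal entries this write flushed.
 */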
static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
				      struct btree_write *w)
{
	unsigned long old, new;

	old = READ_ONCE(b->will_make_reachable);
	do {
		new = old;
		if (!(old & 1))
			break;

		new &= ~1UL;
	} while (!try_cmpxchg(&b->will_make_reachable, &old, new));

	if (old & 1)
		closure_put(&((struct btree_update *) new)->cl);

	bch2_journal_pin_drop(&c->journal, &w->journal);
}

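/*
 * Mark the just-completed write as done and decide what happens next: if the
 * node was redirtied and needs another write (and nothing blocks it), kick off
 * that write immediately; otherwise clear the write_in_flight bits and wake up
 * waiters.
 */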
static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
{
	struct btree_write *w = btree_prev_write(b);
	unsigned long old, new;
	unsigned type = 0;

	bch2_btree_complete_write(c, b, w);

	old = READ_ONCE(b->flags);
	do {
		new = old;

		if ((old & (1U << BTREE_NODE_dirty)) &&
		    (old & (1U << BTREE_NODE_need_write)) &&
		    !(old & (1U << BTREE_NODE_never_write)) &&
		    !(old & (1U << BTREE_NODE_write_blocked)) &&
		    !(old & (1U << BTREE_NODE_will_make_reachable))) {
			new &= ~(1U << BTREE_NODE_dirty);
			new &= ~(1U << BTREE_NODE_need_write);
			new |=  (1U << BTREE_NODE_write_in_flight);
			new |=  (1U << BTREE_NODE_write_in_flight_inner);
			new |=  (1U << BTREE_NODE_just_written);
			new ^=  (1U << BTREE_NODE_write_idx);

			type = new & BTREE_WRITE_TYPE_MASK;
			new &= ~BTREE_WRITE_TYPE_MASK;
		} else {
			new &= ~(1U << BTREE_NODE_write_in_flight);
			new &= ~(1U << BTREE_NODE_write_in_flight_inner);
		}
	} while (!try_cmpxchg(&b->flags, &old, new));

	if (new & (1U << BTREE_NODE_write_in_flight))
		__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
	else
		wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}

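/*
 * Like __btree_node_write_done(), but takes a read lock on the node first;
 * the transaction is only needed for the lock operation itself.
 */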
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
{
	struct btree_trans *trans = bch2_trans_get(c);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);

	/* we don't need transaction context anymore after we got the lock. */
	bch2_trans_put(trans);
	__btree_node_write_done(c, b);
	six_unlock_read(&b->c.lock);
}

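/*
 * Completion work for a btree node write: free the bounce buffer, drop
 * pointers to devices the write failed on and, unless this was the first
 * write to a new node, update the node's key with the surviving pointers.
 * If every replica failed, the node is marked noevict and (unless we're
 * already going read-only) a fatal error is raised.
 */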
static void btree_node_write_work(struct work_struct *work)
{
	struct btree_write_bio *wbio =
		container_of(work, struct btree_write_bio, work);
	struct bch_fs *c	= wbio->wbio.c;
	struct btree *b		= wbio->wbio.bio.bi_private;
	int ret = 0;

	btree_bounce_free(c,
		wbio->data_bytes,
		wbio->wbio.used_mempool,
		wbio->data);

	bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));

	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
		ret = -BCH_ERR_btree_node_write_all_failed;
		goto err;
	}

	if (wbio->wbio.first_btree_write) {
		if (wbio->wbio.failed.nr) {

		}
	} else {
		ret = bch2_trans_do(c,
			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
					BCH_WATERMARK_interior_updates|
					BCH_TRANS_COMMIT_journal_reclaim|
					BCH_TRANS_COMMIT_no_enospc|
					BCH_TRANS_COMMIT_no_check_rw,
					!wbio->wbio.failed.nr));
		if (ret)
			goto err;
	}
out:
	bio_put(&wbio->wbio.bio);
	btree_node_write_done(c, b);
	return;
err:
	set_btree_node_noevict(b);
	bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
			     "writing btree node: %s", bch2_err_str(ret));
	goto out;
}

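/*
 * Per-device bio completion: account latency, record IO errors against the
 * device, then when the last split bio completes hand off to
 * btree_node_write_work() in process context.
 */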
static void btree_node_write_endio(struct bio *bio)
{
	struct bch_write_bio *wbio	= to_wbio(bio);
	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
	struct bch_write_bio *orig	= parent ?: wbio;
	struct btree_write_bio *wb	= container_of(orig, struct btree_write_bio, wbio);
	struct bch_fs *c		= wbio->c;
	struct btree *b			= wbio->bio.bi_private;
	struct bch_dev *ca		= wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
	unsigned long flags;

	if (wbio->have_ioref)
		bch2_latency_acct(ca, wbio->submit_time, WRITE);

	if (!ca ||
	    bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
			       "btree write error: %s",
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("btree")) {
		spin_lock_irqsave(&c->btree_write_error_lock, flags);
		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
	}

	if (wbio->have_ioref)
		percpu_ref_put(&ca->io_ref);

	if (parent) {
		bio_put(bio);
		bio_endio(&parent->bio);
		return;
	}

	clear_btree_node_write_in_flight_inner(b);
	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
	INIT_WORK(&wb->work, btree_node_write_work);
	queue_work(c->btree_io_complete_wq, &wb->work);
}

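/*
 * Sanity check a bset (and the node's key) before it goes out to disk; any
 * inconsistency here is a bug, so mark the filesystem inconsistent and dump a
 * stack trace, and the caller aborts the write.
 */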
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
				   struct bset *i, unsigned sectors)
{
	bool saw_error;

	int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
				     BKEY_TYPE_btree, WRITE);
	if (ret) {
		bch2_fs_inconsistent(c, "invalid btree node key before write");
		return ret;
	}

	ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
		validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
	if (ret) {
		bch2_inconsistent_error(c);
		dump_stack();
	}

	return ret;
}

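/*
 * Submit the actual write: take a copy of the node's key, add this write's
 * offset within the node to each pointer, and submit one bio per replica.
 */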
static void btree_write_submit(struct work_struct *work)
{
	struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;

	bkey_copy(&tmp.k, &wbio->key);

	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
		ptr->offset += wbio->sector_offset;

	bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
				  &tmp.k, false);
}

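/*
 * Start a btree node write, if one isn't already in flight: atomically claim
 * the dirty bit, sort the unwritten bsets into a bounce buffer, checksum (and
 * optionally encrypt) it, then queue the write for submission.
 */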
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
{
	struct btree_write_bio *wbio;
	struct bset *i;
	struct btree_node *bn = NULL;
	struct btree_node_entry *bne = NULL;
	struct sort_iter_stack sort_iter;
	struct nonce nonce;
	unsigned bytes_to_write, sectors_to_write, bytes, u64s;
	u64 seq = 0;
	bool used_mempool;
	unsigned long old, new;
	bool validate_before_checksum = false;
	enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
	void *data;
	int ret;

	if (flags & BTREE_WRITE_ALREADY_STARTED)
		goto do_write;

	/*
	 * We may only have a read lock on the btree node - the dirty bit is our
	 * "lock" against racing with other threads that may be trying to start
	 * a write, we do a write iff we clear the dirty bit. Since setting the
	 * dirty bit requires a write lock, we can't race with other threads
	 * redirtying it:
	 */
	old = READ_ONCE(b->flags);
	do {
		new = old;

		if (!(old & (1 << BTREE_NODE_dirty)))
			return;

		if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
		    !(old & (1 << BTREE_NODE_need_write)))
			return;

		if (old &
		    ((1 << BTREE_NODE_never_write)|
		     (1 << BTREE_NODE_write_blocked)))
			return;

		if (b->written &&
		    (old & (1 << BTREE_NODE_will_make_reachable)))
			return;

		if (old & (1 << BTREE_NODE_write_in_flight))
			return;

		if (flags & BTREE_WRITE_ONLY_IF_NEED)
			type = new & BTREE_WRITE_TYPE_MASK;
		new &= ~BTREE_WRITE_TYPE_MASK;

		new &= ~(1 << BTREE_NODE_dirty);
		new &= ~(1 << BTREE_NODE_need_write);
		new |=  (1 << BTREE_NODE_write_in_flight);
		new |=  (1 << BTREE_NODE_write_in_flight_inner);
		new |=  (1 << BTREE_NODE_just_written);
		new ^=  (1 << BTREE_NODE_write_idx);
	} while (!try_cmpxchg_acquire(&b->flags, &old, new));

	if (new & (1U << BTREE_NODE_need_write))
		return;
do_write:
	BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));

	atomic_long_dec(&c->btree_cache.nr_dirty);

	BUG_ON(btree_node_fake(b));
	BUG_ON((b->will_make_reachable != 0) != !b->written);

	BUG_ON(b->written >= btree_sectors(c));
	BUG_ON(b->written & (block_sectors(c) - 1));
	BUG_ON(bset_written(b, btree_bset_last(b)));
	BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
	BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));

	bch2_sort_whiteouts(c, b);

	sort_iter_stack_init(&sort_iter, b);

	bytes = !b->written
		? sizeof(struct btree_node)
		: sizeof(struct btree_node_entry);

	bytes += b->whiteout_u64s * sizeof(u64);

	for_each_bset(b, t) {
		i = bset(b, t);

		if (bset_written(b, i))
			continue;

		bytes += le16_to_cpu(i->u64s) * sizeof(u64);
		sort_iter_add(&sort_iter.iter,
			      btree_bkey_first(b, t),
			      btree_bkey_last(b, t));
		seq = max(seq, le64_to_cpu(i->journal_seq));
	}

	BUG_ON(b->written && !seq);

	/* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
	bytes += 8;

	/* buffer must be a multiple of the block size */
	bytes = round_up(bytes, block_bytes(c));

	data = btree_bounce_alloc(c, bytes, &used_mempool);

	if (!b->written) {
		bn = data;
		*bn = *b->data;
		i = &bn->keys;
	} else {
		bne = data;
		bne->keys = b->data->keys;
		i = &bne->keys;
	}

	i->journal_seq	= cpu_to_le64(seq);
	i->u64s		= 0;

	sort_iter_add(&sort_iter.iter,
		      unwritten_whiteouts_start(b),
		      unwritten_whiteouts_end(b));
	SET_BSET_SEPARATE_WHITEOUTS(i, false);

	u64s = bch2_sort_keys_keep_unwritten_whiteouts(i->start, &sort_iter.iter);
	le16_add_cpu(&i->u64s, u64s);

	b->whiteout_u64s = 0;

	BUG_ON(!b->written && i->u64s != b->data->keys.u64s);

	set_needs_whiteout(i, false);

	/* do we have data to write? */
	if (b->written && !i->u64s)
		goto nowrite;

	bytes_to_write = vstruct_end(i) - data;
	sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;

	if (!b->written &&
	    b->key.k.type == KEY_TYPE_btree_ptr_v2)
		BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write);

	memset(data + bytes_to_write, 0,
	       (sectors_to_write << 9) - bytes_to_write);

	BUG_ON(b->written + sectors_to_write > btree_sectors(c));
	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
	BUG_ON(i->seq != b->data->keys.seq);

	i->version = cpu_to_le16(c->sb.version);
	SET_BSET_OFFSET(i, b->written);
	SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));

	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
		validate_before_checksum = true;

	/* validate_bset will be modifying: */
	if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
		validate_before_checksum = true;

	/* if we're going to be encrypting, check metadata validity first: */
	if (validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	ret = bset_encrypt(c, i, b->written << 9);
	if (bch2_fs_fatal_err_on(ret, c,
			"encrypting btree node: %s", bch2_err_str(ret)))
		goto err;

	nonce = btree_nonce(i, b->written << 9);

	if (bn)
		bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
	else
		bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);

	/* if we're not encrypting, check metadata after checksumming: */
	if (!validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	/*
	 * We handle btree write errors by immediately halting the journal -
	 * after we've done that, we can't issue any subsequent btree writes
	 * because they might have pointers to new nodes that failed to write.
	 *
	 * Furthermore, there's no point in doing any more btree writes because
	 * with the journal stopped, we're never going to update the journal to
	 * reflect that those writes were done and the data flushed from the
	 * journal:
	 *
	 * Also on journal error, the pending write may have updates that were
	 * never journalled (interior nodes, see btree_update_nodes_written()) -
	 * it's critical that we don't do the write in that case otherwise we
	 * will have updates visible that weren't in the journal:
	 *
	 * Make sure to update b->written so bch2_btree_init_next() doesn't
	 * break:
	 */
	if (bch2_journal_error(&c->journal) ||
	    c->opts.nochanges)
		goto err;

	trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);

	wbio = container_of(bio_alloc_bioset(NULL,
				buf_pages(data, sectors_to_write << 9),
				REQ_OP_WRITE|REQ_META,
				GFP_NOFS,
				&c->btree_bio),
			    struct btree_write_bio, wbio.bio);
	wbio_init(&wbio->wbio.bio);
	wbio->data			= data;
	wbio->data_bytes		= bytes;
	wbio->sector_offset		= b->written;
	wbio->wbio.c			= c;
	wbio->wbio.used_mempool		= used_mempool;
	wbio->wbio.first_btree_write	= !b->written;
	wbio->wbio.bio.bi_end_io	= btree_node_write_endio;
	wbio->wbio.bio.bi_private	= b;

	bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);

	bkey_copy(&wbio->key, &b->key);

	b->written += sectors_to_write;

	if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
			cpu_to_le16(b->written);

	atomic64_inc(&c->btree_write_stats[type].nr);
	atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);

	INIT_WORK(&wbio->work, btree_write_submit);
	queue_work(c->btree_write_submit_wq, &wbio->work);
	return;
err:
	set_btree_node_noevict(b);
	b->written += sectors_to_write;
nowrite:
	btree_bounce_free(c, bytes, used_mempool, data);
	__btree_node_write_done(c, b);
}

/*
 * Work that must be done with write lock held:
 */
bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
{
	bool invalidated_iter = false;
	struct btree_node_entry *bne;

	if (!btree_node_just_written(b))
		return false;

	BUG_ON(b->whiteout_u64s);

	clear_btree_node_just_written(b);

	/*
	 * Note: immediately after write, bset_written() doesn't work - the
	 * amount of data we had to write after compaction might have been
	 * smaller than the offset of the last bset.
	 *
	 * However, we know that all bsets have been written here, as long as
	 * we're still holding the write lock:
	 */

	/*
	 * XXX: decide if we really want to unconditionally sort down to a
	 * single bset:
	 */
	if (b->nsets > 1) {
		btree_node_sort(c, b, 0, b->nsets);
		invalidated_iter = true;
	} else {
		invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
	}

	for_each_bset(b, t)
		set_needs_whiteout(bset(b, t), true);

	bch2_btree_verify(c, b);

	/*
	 * If later we don't unconditionally sort down to a single bset, we have
	 * to ensure this is still true:
	 */
	BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));

	bne = want_new_bset(c, b);
	if (bne)
		bch2_bset_init_next(b, bne);

	bch2_btree_build_aux_trees(b);

	return invalidated_iter;
}

/*
 * Use this one if the node is intent locked:
 */
void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
			   enum six_lock_type lock_type_held,
			   unsigned flags)
{
	if (lock_type_held == SIX_LOCK_intent ||
	    (lock_type_held == SIX_LOCK_read &&
	     six_lock_tryupgrade(&b->c.lock))) {
		__bch2_btree_node_write(c, b, flags);

		/* don't cycle lock unnecessarily: */
		if (btree_node_just_written(b) &&
		    six_trylock_write(&b->c.lock)) {
			bch2_btree_post_write_cleanup(c, b);
			six_unlock_write(&b->c.lock);
		}

		if (lock_type_held == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	} else {
		__bch2_btree_node_write(c, b, flags);
		if (lock_type_held == SIX_LOCK_write &&
		    btree_node_just_written(b))
			bch2_btree_post_write_cleanup(c, b);
	}
}

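/*
 * Wait for all in-flight reads or writes (depending on @flag) to cached btree
 * nodes to complete; returns true if we had to wait.
 */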
static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct btree *b;
	unsigned i;
	bool ret = false;
restart:
	rcu_read_lock();
	for_each_cached_btree(b, c, tbl, i, pos)
		if (test_bit(flag, &b->flags)) {
			rcu_read_unlock();
			wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
			ret = true;
			goto restart;
		}
	rcu_read_unlock();

	return ret;
}

bool bch2_btree_flush_all_reads(struct bch_fs *c)
{
	return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
}

bool bch2_btree_flush_all_writes(struct bch_fs *c)
{
	return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}

static const char * const bch2_btree_write_types[] = {
#define x(t, n) [n] = #t,
	BCH_BTREE_WRITE_TYPES()
	NULL
};

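/*
 * Print a table of per-write-type write counts and average write sizes.
 */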
void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
	printbuf_tabstop_push(out, 20);
	printbuf_tabstop_push(out, 10);

	prt_printf(out, "\tnr\tsize\n");

	for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
		u64 nr		= atomic64_read(&c->btree_write_stats[i].nr);
		u64 bytes	= atomic64_read(&c->btree_write_stats[i].bytes);

		prt_printf(out, "%s:\t%llu\t", bch2_btree_write_types[i], nr);
		prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
		prt_newline(out);
	}
}