1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "btree_cache.h"
5 #include "btree_io.h"
6 #include "btree_journal_iter.h"
7 #include "btree_node_scan.h"
8 #include "btree_update_interior.h"
9 #include "buckets.h"
10 #include "error.h"
11 #include "journal_io.h"
12 #include "recovery_passes.h"
13 
14 #include <linux/kthread.h>
15 #include <linux/sort.h>
16 
/*
 * Arguments handed to one read_btree_nodes_worker() thread; allocated by
 * read_btree_nodes() and freed by the worker when it finishes.
 */
struct find_btree_nodes_worker {
	/* closure the spawner syncs on; the worker puts it when done */
	struct closure		*cl;
	/* shared scan state that found nodes and errors are recorded in */
	struct find_btree_nodes	*f;
	/* device this worker scans */
	struct bch_dev		*ca;
};
22 
found_btree_node_to_text(struct printbuf * out,struct bch_fs * c,const struct found_btree_node * n)23 static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
24 {
25 	prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ",
26 		   bch2_btree_id_str(n->btree_id), n->level, n->seq,
27 		   n->journal_seq, n->cookie);
28 	bch2_bpos_to_text(out, n->min_key);
29 	prt_str(out, "-");
30 	bch2_bpos_to_text(out, n->max_key);
31 
32 	if (n->range_updated)
33 		prt_str(out, " range updated");
34 	if (n->overwritten)
35 		prt_str(out, " overwritten");
36 
37 	for (unsigned i = 0; i < n->nr_ptrs; i++) {
38 		prt_char(out, ' ');
39 		bch2_extent_ptr_to_text(out, c, n->ptrs + i);
40 	}
41 }
42 
/*
 * Append a description of every node in @nodes to @out, one node per line,
 * with the indent level raised by 2 for the duration of the listing.
 */
static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
{
	printbuf_indent_add(out, 2);

	darray_for_each(nodes, node) {
		found_btree_node_to_text(out, c, node);
		prt_newline(out);
	}

	printbuf_indent_sub(out, 2);
}
52 
found_btree_node_to_key(struct bkey_i * k,const struct found_btree_node * f)53 static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
54 {
55 	struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
56 
57 	set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
58 	bp->k.p			= f->max_key;
59 	bp->v.seq		= cpu_to_le64(f->cookie);
60 	bp->v.sectors_written	= 0;
61 	bp->v.flags		= 0;
62 	bp->v.sectors_written	= cpu_to_le16(f->sectors_written);
63 	bp->v.min_key		= f->min_key;
64 	SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
65 	memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
66 }
67 
bkey_journal_seq(struct bkey_s_c k)68 static inline u64 bkey_journal_seq(struct bkey_s_c k)
69 {
70 	switch (k.k->type) {
71 	case KEY_TYPE_inode_v3:
72 		return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
73 	default:
74 		return 0;
75 	}
76 }
77 
found_btree_node_is_readable(struct btree_trans * trans,struct found_btree_node * f)78 static bool found_btree_node_is_readable(struct btree_trans *trans,
79 					 struct found_btree_node *f)
80 {
81 	struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
82 
83 	found_btree_node_to_key(&tmp.k, f);
84 
85 	struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
86 	bool ret = !IS_ERR_OR_NULL(b);
87 	if (!ret)
88 		return ret;
89 
90 	f->sectors_written = b->written;
91 	f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
92 
93 	struct bkey_s_c k;
94 	struct bkey unpacked;
95 	struct btree_node_iter iter;
96 	for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
97 		f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
98 
99 	six_unlock_read(&b->c.lock);
100 
101 	/*
102 	 * We might update this node's range; if that happens, we need the node
103 	 * to be re-read so the read path can trim keys that are no longer in
104 	 * this node
105 	 */
106 	if (b != btree_node_root(trans->c, b))
107 		bch2_btree_node_evict(trans, &tmp.k);
108 	return ret;
109 }
110 
found_btree_node_cmp_cookie(const void * _l,const void * _r)111 static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
112 {
113 	const struct found_btree_node *l = _l;
114 	const struct found_btree_node *r = _r;
115 
116 	return  cmp_int(l->btree_id,	r->btree_id) ?:
117 		cmp_int(l->level,	r->level) ?:
118 		cmp_int(l->cookie,	r->cookie);
119 }
120 
121 /*
122  * Given two found btree nodes, if their sequence numbers are equal, take the
123  * one that's readable:
124  */
found_btree_node_cmp_time(const struct found_btree_node * l,const struct found_btree_node * r)125 static int found_btree_node_cmp_time(const struct found_btree_node *l,
126 				     const struct found_btree_node *r)
127 {
128 	return  cmp_int(l->seq, r->seq) ?:
129 		cmp_int(l->journal_seq, r->journal_seq);
130 }
131 
found_btree_node_cmp_pos(const void * _l,const void * _r)132 static int found_btree_node_cmp_pos(const void *_l, const void *_r)
133 {
134 	const struct found_btree_node *l = _l;
135 	const struct found_btree_node *r = _r;
136 
137 	return  cmp_int(l->btree_id,	r->btree_id) ?:
138 	       -cmp_int(l->level,	r->level) ?:
139 		bpos_cmp(l->min_key,	r->min_key) ?:
140 	       -found_btree_node_cmp_time(l, r);
141 }
142 
/*
 * Read one page at sector @offset of device @ca into @bn and, if it looks
 * like the header of a btree node belonging to this filesystem, record it
 * in f->nodes.
 *
 * A candidate is silently skipped on IO error, wrong magic, an alloc btree
 * id, or an out-of-range level or btree id, and also when the node cannot
 * be read back via found_btree_node_is_readable().
 *
 * Failures to record a readable node (endianness mismatch, allocation
 * failure) are reported through f->ret, under f->lock.
 */
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
				struct bio *bio, struct btree_node *bn, u64 offset)
{
	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);

	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
	bio->bi_iter.bi_sector	= offset;
	bch2_bio_map(bio, bn, PAGE_SIZE);
	submit_bio_wait(bio);

	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
			       "IO error in try_read_btree_node() at %llu: %s",
			       offset, bch2_blk_status_to_str(bio->bi_status)))
		return;

	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return;

	/* the header fields between flags and keys are encrypted on disk */
	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
		struct nonce nonce = btree_nonce(&bn->keys, 0);
		unsigned hdr_bytes = (void *) &bn->keys - (void *) &bn->flags;

		bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, hdr_bytes);
	}

	/* alloc btrees are not collected; reject nonsensical level/id */
	if (btree_id_is_alloc(BTREE_NODE_ID(bn)) ||
	    BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH ||
	    BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
		return;

	rcu_read_lock();
	struct found_btree_node found = {
		.btree_id	= BTREE_NODE_ID(bn),
		.level		= BTREE_NODE_LEVEL(bn),
		.seq		= BTREE_NODE_SEQ(bn),
		.cookie		= le64_to_cpu(bn->keys.seq),
		.min_key	= bn->min_key,
		.max_key	= bn->max_key,
		.nr_ptrs	= 1,
		.ptrs[0].type	= 1 << BCH_EXTENT_ENTRY_ptr,
		.ptrs[0].offset	= offset,
		.ptrs[0].dev	= ca->dev_idx,
		.ptrs[0].gen	= bucket_gen_get(ca, sector_to_bucket(ca, offset)),
	};
	rcu_read_unlock();

	if (!bch2_trans_run(c, found_btree_node_is_readable(trans, &found)))
		return;

	mutex_lock(&f->lock);
	if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
		bch_err(c, "try_read_btree_node() can't handle endian conversion");
		f->ret = -EINVAL;
	} else if (darray_push(&f->nodes, found)) {
		f->ret = -ENOMEM;
	}
	mutex_unlock(&f->lock);
}
207 
read_btree_nodes_worker(void * p)208 static int read_btree_nodes_worker(void *p)
209 {
210 	struct find_btree_nodes_worker *w = p;
211 	struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
212 	struct bch_dev *ca = w->ca;
213 	void *buf = (void *) __get_free_page(GFP_KERNEL);
214 	struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
215 	unsigned long last_print = jiffies;
216 
217 	if (!buf || !bio) {
218 		bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
219 		w->f->ret = -ENOMEM;
220 		goto err;
221 	}
222 
223 	for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
224 		for (unsigned bucket_offset = 0;
225 		     bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
226 		     bucket_offset += btree_sectors(c)) {
227 			if (time_after(jiffies, last_print + HZ * 30)) {
228 				u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
229 				u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
230 
231 				bch_info(ca, "%s: %2u%% done", __func__,
232 					 (unsigned) div64_u64(cur_sector * 100, end_sector));
233 				last_print = jiffies;
234 			}
235 
236 			u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
237 
238 			if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
239 			    !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
240 				continue;
241 
242 			try_read_btree_node(w->f, ca, bio, buf, sector);
243 		}
244 err:
245 	bio_put(bio);
246 	free_page((unsigned long) buf);
247 	percpu_ref_get(&ca->io_ref);
248 	closure_put(w->cl);
249 	kfree(w);
250 	return 0;
251 }
252 
read_btree_nodes(struct find_btree_nodes * f)253 static int read_btree_nodes(struct find_btree_nodes *f)
254 {
255 	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
256 	struct closure cl;
257 	int ret = 0;
258 
259 	closure_init_stack(&cl);
260 
261 	for_each_online_member(c, ca) {
262 		if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
263 			continue;
264 
265 		struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
266 		struct task_struct *t;
267 
268 		if (!w) {
269 			percpu_ref_put(&ca->io_ref);
270 			ret = -ENOMEM;
271 			goto err;
272 		}
273 
274 		percpu_ref_get(&ca->io_ref);
275 		closure_get(&cl);
276 		w->cl		= &cl;
277 		w->f		= f;
278 		w->ca		= ca;
279 
280 		t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
281 		ret = PTR_ERR_OR_ZERO(t);
282 		if (ret) {
283 			percpu_ref_put(&ca->io_ref);
284 			closure_put(&cl);
285 			f->ret = ret;
286 			bch_err(c, "error starting kthread: %i", ret);
287 			break;
288 		}
289 	}
290 err:
291 	closure_sync(&cl);
292 	return f->ret ?: ret;
293 }
294 
bubble_up(struct found_btree_node * n,struct found_btree_node * end)295 static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
296 {
297 	while (n + 1 < end &&
298 	       found_btree_node_cmp_pos(n, n + 1) > 0) {
299 		swap(n[0], n[1]);
300 		n++;
301 	}
302 }
303 
handle_overwrites(struct bch_fs * c,struct found_btree_node * start,struct found_btree_node * end)304 static int handle_overwrites(struct bch_fs *c,
305 			     struct found_btree_node *start,
306 			     struct found_btree_node *end)
307 {
308 	struct found_btree_node *n;
309 again:
310 	for (n = start + 1;
311 	     n < end &&
312 	     n->btree_id	== start->btree_id &&
313 	     n->level		== start->level &&
314 	     bpos_lt(n->min_key, start->max_key);
315 	     n++)  {
316 		int cmp = found_btree_node_cmp_time(start, n);
317 
318 		if (cmp > 0) {
319 			if (bpos_cmp(start->max_key, n->max_key) >= 0)
320 				n->overwritten = true;
321 			else {
322 				n->range_updated = true;
323 				n->min_key = bpos_successor(start->max_key);
324 				n->range_updated = true;
325 				bubble_up(n, end);
326 				goto again;
327 			}
328 		} else if (cmp < 0) {
329 			BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
330 
331 			start->max_key = bpos_predecessor(n->min_key);
332 			start->range_updated = true;
333 		} else if (n->level) {
334 			n->overwritten = true;
335 		} else {
336 			if (bpos_cmp(start->max_key, n->max_key) >= 0)
337 				n->overwritten = true;
338 			else {
339 				n->range_updated = true;
340 				n->min_key = bpos_successor(start->max_key);
341 				n->range_updated = true;
342 				bubble_up(n, end);
343 				goto again;
344 			}
345 		}
346 	}
347 
348 	return 0;
349 }
350 
bch2_scan_for_btree_nodes(struct bch_fs * c)351 int bch2_scan_for_btree_nodes(struct bch_fs *c)
352 {
353 	struct find_btree_nodes *f = &c->found_btree_nodes;
354 	struct printbuf buf = PRINTBUF;
355 	size_t dst;
356 	int ret = 0;
357 
358 	if (f->nodes.nr)
359 		return 0;
360 
361 	mutex_init(&f->lock);
362 
363 	ret = read_btree_nodes(f);
364 	if (ret)
365 		return ret;
366 
367 	if (!f->nodes.nr) {
368 		bch_err(c, "%s: no btree nodes found", __func__);
369 		ret = -EINVAL;
370 		goto err;
371 	}
372 
373 	if (0 && c->opts.verbose) {
374 		printbuf_reset(&buf);
375 		prt_printf(&buf, "%s: nodes found:\n", __func__);
376 		found_btree_nodes_to_text(&buf, c, f->nodes);
377 		bch2_print_string_as_lines(KERN_INFO, buf.buf);
378 	}
379 
380 	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
381 
382 	dst = 0;
383 	darray_for_each(f->nodes, i) {
384 		struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
385 
386 		if (prev &&
387 		    prev->cookie == i->cookie) {
388 			if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
389 				bch_err(c, "%s: found too many replicas for btree node", __func__);
390 				ret = -EINVAL;
391 				goto err;
392 			}
393 			prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
394 		} else {
395 			f->nodes.data[dst++] = *i;
396 		}
397 	}
398 	f->nodes.nr = dst;
399 
400 	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
401 
402 	if (0 && c->opts.verbose) {
403 		printbuf_reset(&buf);
404 		prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
405 		found_btree_nodes_to_text(&buf, c, f->nodes);
406 		bch2_print_string_as_lines(KERN_INFO, buf.buf);
407 	}
408 
409 	dst = 0;
410 	darray_for_each(f->nodes, i) {
411 		if (i->overwritten)
412 			continue;
413 
414 		ret = handle_overwrites(c, i, &darray_top(f->nodes));
415 		if (ret)
416 			goto err;
417 
418 		BUG_ON(i->overwritten);
419 		f->nodes.data[dst++] = *i;
420 	}
421 	f->nodes.nr = dst;
422 
423 	if (c->opts.verbose) {
424 		printbuf_reset(&buf);
425 		prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
426 		found_btree_nodes_to_text(&buf, c, f->nodes);
427 		bch2_print_string_as_lines(KERN_INFO, buf.buf);
428 	}
429 
430 	eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
431 err:
432 	printbuf_exit(&buf);
433 	return ret;
434 }
435 
found_btree_node_range_start_cmp(const void * _l,const void * _r)436 static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
437 {
438 	const struct found_btree_node *l = _l;
439 	const struct found_btree_node *r = _r;
440 
441 	return  cmp_int(l->btree_id,	r->btree_id) ?:
442 	       -cmp_int(l->level,	r->level) ?:
443 		bpos_cmp(l->max_key,	r->min_key);
444 }
445 
/*
 * Iterate over the found nodes in @_f that overlap the range described by
 * @_search (a struct found_btree_node lvalue with btree_id, level, min_key
 * and max_key filled in). @_idx is declared by the macro and holds the
 * index into (_f)->nodes.data of the current node.
 *
 * The walk starts at the index eytzinger0_find_gt() returns for @_search
 * under found_btree_node_range_start_cmp(), and continues in eytzinger
 * order for as long as nodes are in the same btree, at the same level, and
 * begin before @_search's max_key.
 */
#define for_each_found_btree_node_in_range(_f, _search, _idx)				\
	for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,		\
					sizeof((_f)->nodes.data[0]),			\
					found_btree_node_range_start_cmp, &(_search));	\
	     _idx < (_f)->nodes.nr &&							\
	     (_f)->nodes.data[_idx].btree_id == (_search).btree_id &&			\
	     (_f)->nodes.data[_idx].level == (_search).level &&				\
	     bpos_lt((_f)->nodes.data[_idx].min_key, (_search).max_key);			\
	     _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
455 
bch2_btree_node_is_stale(struct bch_fs * c,struct btree * b)456 bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
457 {
458 	struct find_btree_nodes *f = &c->found_btree_nodes;
459 
460 	struct found_btree_node search = {
461 		.btree_id	= b->c.btree_id,
462 		.level		= b->c.level,
463 		.min_key	= b->data->min_key,
464 		.max_key	= b->key.k.p,
465 	};
466 
467 	for_each_found_btree_node_in_range(f, search, idx)
468 		if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
469 			return true;
470 	return false;
471 }
472 
bch2_btree_has_scanned_nodes(struct bch_fs * c,enum btree_id btree)473 bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
474 {
475 	struct found_btree_node search = {
476 		.btree_id	= btree,
477 		.level		= 0,
478 		.min_key	= POS_MIN,
479 		.max_key	= SPOS_MAX,
480 	};
481 
482 	for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
483 		return true;
484 	return false;
485 }
486 
bch2_get_scanned_nodes(struct bch_fs * c,enum btree_id btree,unsigned level,struct bpos node_min,struct bpos node_max)487 int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
488 			   unsigned level, struct bpos node_min, struct bpos node_max)
489 {
490 	if (btree_id_is_alloc(btree))
491 		return 0;
492 
493 	struct find_btree_nodes *f = &c->found_btree_nodes;
494 
495 	int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
496 	if (ret)
497 		return ret;
498 
499 	if (c->opts.verbose) {
500 		struct printbuf buf = PRINTBUF;
501 
502 		prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
503 		bch2_bpos_to_text(&buf, node_min);
504 		prt_str(&buf, " - ");
505 		bch2_bpos_to_text(&buf, node_max);
506 
507 		bch_info(c, "%s(): %s", __func__, buf.buf);
508 		printbuf_exit(&buf);
509 	}
510 
511 	struct found_btree_node search = {
512 		.btree_id	= btree,
513 		.level		= level,
514 		.min_key	= node_min,
515 		.max_key	= node_max,
516 	};
517 
518 	for_each_found_btree_node_in_range(f, search, idx) {
519 		struct found_btree_node n = f->nodes.data[idx];
520 
521 		n.range_updated |= bpos_lt(n.min_key, node_min);
522 		n.min_key = bpos_max(n.min_key, node_min);
523 
524 		n.range_updated |= bpos_gt(n.max_key, node_max);
525 		n.max_key = bpos_min(n.max_key, node_max);
526 
527 		struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
528 
529 		found_btree_node_to_key(&tmp.k, &n);
530 
531 		struct printbuf buf = PRINTBUF;
532 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
533 		bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
534 		printbuf_exit(&buf);
535 
536 		BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0));
537 
538 		ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
539 		if (ret)
540 			return ret;
541 	}
542 
543 	return 0;
544 }
545 
bch2_find_btree_nodes_exit(struct find_btree_nodes * f)546 void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
547 {
548 	darray_exit(&f->nodes);
549 }
550