// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "buckets.h"
#include "disk_accounting.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"

#include <linux/sort.h>

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
					    struct bch_replicas_cpu *);

/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
	size_t size = (size_t) priv;
	return memcmp(l, r, size);
}

/* Replicas tracking - in memory: */

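/*
 * Sanity check the invariants every in-memory entry is expected to satisfy:
 * at least one device, a sane nr_required, and a strictly sorted (hence
 * duplicate-free) device list. Debug builds only.
 */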
static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	BUG_ON(!e->nr_devs);
	BUG_ON(e->nr_required > 1 &&
	       e->nr_required >= e->nr_devs);

	for (unsigned i = 0; i + 1 < e->nr_devs; i++)
		BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}

void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
{
	bubble_sort(e->devs, e->nr_devs, u8_cmp);
}

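/*
 * The table of entries is kept in eytzinger order - an implicit binary search
 * tree laid out breadth-first in the array - which eytzinger0_find() below can
 * search with better cache locality than a plain sorted array.
 */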
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
	eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
			  bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
}

static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
					   struct bch_replicas_entry_v0 *e)
{
	bch2_prt_data_type(out, e->data_type);

	prt_printf(out, ": %u [", e->nr_devs);
	for (unsigned i = 0; i < e->nr_devs; i++)
		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
	prt_printf(out, "]");
}

void bch2_replicas_entry_to_text(struct printbuf *out,
				 struct bch_replicas_entry_v1 *e)
{
	bch2_prt_data_type(out, e->data_type);

	prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
	for (unsigned i = 0; i < e->nr_devs; i++)
		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
	prt_printf(out, "]");
}

static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
					   struct bch_sb *sb,
					   struct printbuf *err)
{
	if (!r->nr_devs) {
		prt_printf(err, "no devices in entry ");
		goto bad;
	}

	if (r->nr_required > 1 &&
	    r->nr_required >= r->nr_devs) {
		prt_printf(err, "bad nr_required in entry ");
		goto bad;
	}

	for (unsigned i = 0; i < r->nr_devs; i++)
		if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
		    !bch2_member_exists(sb, r->devs[i])) {
			prt_printf(err, "invalid device %u in entry ", r->devs[i]);
			goto bad;
		}

	return 0;
bad:
	bch2_replicas_entry_to_text(err, r);
	return -BCH_ERR_invalid_replicas_entry;
}

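/*
 * Same checks as bch2_replicas_entry_sb_validate(), except that device
 * existence is checked against the in-memory bch_fs rather than the
 * superblock.
 */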
int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
				 struct bch_fs *c,
				 struct printbuf *err)
{
	if (!r->nr_devs) {
		prt_printf(err, "no devices in entry ");
		goto bad;
	}

	if (r->nr_required > 1 &&
	    r->nr_required >= r->nr_devs) {
		prt_printf(err, "bad nr_required in entry ");
		goto bad;
	}

	for (unsigned i = 0; i < r->nr_devs; i++)
		if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
		    !bch2_dev_exists(c, r->devs[i])) {
			prt_printf(err, "invalid device %u in entry ", r->devs[i]);
			goto bad;
		}

	return 0;
bad:
	bch2_replicas_entry_to_text(err, r);
	return -BCH_ERR_invalid_replicas_entry;
}

void bch2_cpu_replicas_to_text(struct printbuf *out,
			       struct bch_replicas_cpu *r)
{
	struct bch_replicas_entry_v1 *e;
	bool first = true;

	for_each_cpu_replicas_entry(r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
}

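/*
 * Cached pointers don't contribute to durability, so they're skipped; for
 * pointers into erasure coded stripes the redundancy is accounted via the
 * stripe key instead, which is signalled here by clearing nr_required.
 */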
static void extent_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry_v1 *r)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	r->nr_required	= 1;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.ptr.cached)
			continue;

		if (!p.has_ec)
			replicas_entry_add_dev(r, p.ptr.dev);
		else
			r->nr_required = 0;
	}
}

static void stripe_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry_v1 *r)
{
	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
	const struct bch_extent_ptr *ptr;

	r->nr_required	= s.v->nr_blocks - s.v->nr_redundant;

	for (ptr = s.v->ptrs;
	     ptr < s.v->ptrs + s.v->nr_blocks;
	     ptr++)
		replicas_entry_add_dev(r, ptr->dev);
}

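/*
 * Example (hypothetical): an extent with two non-cached, non-EC pointers on
 * devices 0 and 3 yields the entry "user: 1/2 [0 3]" - BCH_DATA_user,
 * nr_required == 1, two devices.
 */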
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
			   struct bkey_s_c k)
{
	e->nr_devs = 0;

	switch (k.k->type) {
	case KEY_TYPE_btree_ptr:
	case KEY_TYPE_btree_ptr_v2:
		e->data_type = BCH_DATA_btree;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_extent:
	case KEY_TYPE_reflink_v:
		e->data_type = BCH_DATA_user;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_stripe:
		e->data_type = BCH_DATA_parity;
		stripe_to_replicas(k, e);
		break;
	}

	bch2_replicas_entry_sort(e);
}

void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
			      enum bch_data_type data_type,
			      struct bch_devs_list devs)
{
	BUG_ON(!data_type ||
	       data_type == BCH_DATA_sb ||
	       data_type >= BCH_DATA_NR);

	e->data_type	= data_type;
	e->nr_devs	= 0;
	e->nr_required	= 1;

	darray_for_each(devs, i)
		replicas_entry_add_dev(e, *i);

	bch2_replicas_entry_sort(e);
}

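/*
 * Returns a new copy of @old with @new_entry appended and the table re-sorted;
 * entries are padded to the larger of the two entry sizes. On allocation
 * failure the returned table has a NULL .entries, which callers must check.
 */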
static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_fs *c,
		       struct bch_replicas_cpu *old,
		       struct bch_replicas_entry_v1 *new_entry)
{
	struct bch_replicas_cpu new = {
		.nr		= old->nr + 1,
		.entry_size	= max_t(unsigned, old->entry_size,
					replicas_entry_bytes(new_entry)),
	};

	new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
	if (!new.entries)
		return new;

	for (unsigned i = 0; i < old->nr; i++)
		memcpy(cpu_replicas_entry(&new, i),
		       cpu_replicas_entry(old, i),
		       old->entry_size);

	memcpy(cpu_replicas_entry(&new, old->nr),
	       new_entry,
	       replicas_entry_bytes(new_entry));

	bch2_cpu_replicas_sort(&new);
	return new;
}

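/*
 * Eytzinger search for @search in @r; returns -1 if not present. An entry
 * larger than the table's entry size can't possibly be in it, hence the
 * early out.
 */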
static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
				       struct bch_replicas_entry_v1 *search)
{
	int idx, entry_size = replicas_entry_bytes(search);

	if (unlikely(entry_size > r->entry_size))
		return -1;

#define entry_cmp(_l, _r)	memcmp(_l, _r, entry_size)
	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
			      entry_cmp, search);
#undef entry_cmp

	return idx < r->nr ? idx : -1;
}

int bch2_replicas_entry_idx(struct bch_fs *c,
			    struct bch_replicas_entry_v1 *search)
{
	bch2_replicas_entry_sort(search);

	return __replicas_entry_idx(&c->replicas, search);
}

static bool __replicas_has_entry(struct bch_replicas_cpu *r,
				 struct bch_replicas_entry_v1 *search)
{
	return __replicas_entry_idx(r, search) >= 0;
}

bool bch2_replicas_marked_locked(struct bch_fs *c,
				 struct bch_replicas_entry_v1 *search)
{
	verify_replicas_entry(search);

	return !search->nr_devs ||
		(__replicas_has_entry(&c->replicas, search) &&
		 (likely(!c->replicas_gc.entries) ||
		  __replicas_has_entry(&c->replicas_gc, search)));
}

bool bch2_replicas_marked(struct bch_fs *c,
			  struct bch_replicas_entry_v1 *search)
{
	percpu_down_read(&c->mark_lock);
	bool ret = bch2_replicas_marked_locked(c, search);
	percpu_up_read(&c->mark_lock);

	return ret;
}

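/*
 * Slowpath for marking a new replicas entry: take sb_lock, add the entry to
 * the superblock (and to the gc table, if gc is in progress), write the
 * superblock, and only then swap in the new in-memory tables - an entry must
 * never be visible in memory before it is persistent.
 */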
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
				struct bch_replicas_entry_v1 *new_entry)
{
	struct bch_replicas_cpu new_r, new_gc;
	int ret = 0;

	verify_replicas_entry(new_entry);

	memset(&new_r, 0, sizeof(new_r));
	memset(&new_gc, 0, sizeof(new_gc));

	mutex_lock(&c->sb_lock);

	if (c->replicas_gc.entries &&
	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
		new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
		if (!new_gc.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}
	}

	if (!__replicas_has_entry(&c->replicas, new_entry)) {
		new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
		if (!new_r.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}

		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
		if (ret)
			goto err;
	}

	if (!new_r.entries &&
	    !new_gc.entries)
		goto out;

	/* allocations done, now commit: */

	if (new_r.entries)
		bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */
	percpu_down_write(&c->mark_lock);
	if (new_r.entries)
		swap(c->replicas, new_r);
	if (new_gc.entries)
		swap(new_gc, c->replicas_gc);
	percpu_up_write(&c->mark_lock);
out:
	mutex_unlock(&c->sb_lock);

	kfree(new_r.entries);
	kfree(new_gc.entries);

	return ret;
err:
	bch_err_msg(c, ret, "adding replicas entry");
	goto out;
}

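/*
 * Fastpath wrapper: only hits the slowpath when @r isn't already marked. A
 * typical (hypothetical) caller, building an entry for a write to some set of
 * devices, might look roughly like:
 *
 *	struct bch_replicas_padded r;
 *
 *	bch2_devlist_to_replicas(&r.e, BCH_DATA_user, devs);
 *	ret = bch2_mark_replicas(c, &r.e);
 */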
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
	return likely(bch2_replicas_marked(c, r))
		? 0 : bch2_mark_replicas_slowpath(c, r);
}

/*
 * Old replicas_gc mechanism: only used for journal replicas entries now, should
 * die at some point:
 */

int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	percpu_down_write(&c->mark_lock);

	ret = ret ?:
		bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
	if (!ret)
		swap(c->replicas, c->replicas_gc);

	kfree(c->replicas_gc.entries);
	c->replicas_gc.entries = NULL;

	percpu_up_write(&c->mark_lock);

	if (!ret)
		bch2_write_super(c);

	mutex_unlock(&c->sb_lock);

	return ret;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
	struct bch_replicas_entry_v1 *e;
	unsigned i = 0;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	BUG_ON(c->replicas_gc.entries);

	c->replicas_gc.nr		= 0;
	c->replicas_gc.entry_size	= 0;

	for_each_cpu_replicas_entry(&c->replicas, e) {
		/* Preserve unknown data types */
		if (e->data_type >= BCH_DATA_NR ||
		    !((1 << e->data_type) & typemask)) {
			c->replicas_gc.nr++;
			c->replicas_gc.entry_size =
				max_t(unsigned, c->replicas_gc.entry_size,
				      replicas_entry_bytes(e));
		}
	}

	c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
					 c->replicas_gc.entry_size,
					 GFP_KERNEL);
	if (!c->replicas_gc.entries) {
		mutex_unlock(&c->sb_lock);
		bch_err(c, "error allocating c->replicas_gc");
		return -BCH_ERR_ENOMEM_replicas_gc;
	}

	for_each_cpu_replicas_entry(&c->replicas, e)
		if (e->data_type >= BCH_DATA_NR ||
		    !((1 << e->data_type) & typemask))
			memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
			       e, c->replicas_gc.entry_size);

	bch2_cpu_replicas_sort(&c->replicas_gc);
	mutex_unlock(&c->sb_lock);

	return 0;
}

/*
 * New much simpler mechanism for clearing out unneeded replicas entries - drop
 * replicas entries that have 0 sectors used.
 *
 * However, we don't track sector counts for journal usage, so this doesn't drop
 * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
 * is retained for that.
 */
int bch2_replicas_gc2(struct bch_fs *c)
{
	struct bch_replicas_cpu new = { 0 };
	unsigned nr;
	int ret = 0;

	bch2_accounting_mem_gc(c);
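	/*
	 * The new table is allocated before taking mark_lock; if the replicas
	 * table was resized while we weren't holding it, drop everything and
	 * retry with the new dimensions.
	 */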
retry:
	nr		= READ_ONCE(c->replicas.nr);
	new.entry_size	= READ_ONCE(c->replicas.entry_size);
	new.entries	= kcalloc(nr, new.entry_size, GFP_KERNEL);
	if (!new.entries) {
		bch_err(c, "error allocating c->replicas_gc");
		return -BCH_ERR_ENOMEM_replicas_gc;
	}

	mutex_lock(&c->sb_lock);
	percpu_down_write(&c->mark_lock);

	if (nr			!= c->replicas.nr ||
	    new.entry_size	!= c->replicas.entry_size) {
		percpu_up_write(&c->mark_lock);
		mutex_unlock(&c->sb_lock);
		kfree(new.entries);
		goto retry;
	}

	for (unsigned i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry_v1 *e =
			cpu_replicas_entry(&c->replicas, i);

		struct disk_accounting_pos k = {
			.type = BCH_DISK_ACCOUNTING_replicas,
		};

		unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e),
			      "embedded variable length struct");

		struct bpos p = disk_accounting_pos_to_bpos(&k);

		struct bch_accounting_mem *acc = &c->accounting;
		bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
					    accounting_pos_cmp, &p) >= acc->k.nr;

		if (e->data_type == BCH_DATA_journal || !kill)
			memcpy(cpu_replicas_entry(&new, new.nr++),
			       e, new.entry_size);
	}

	bch2_cpu_replicas_sort(&new);

	ret = bch2_cpu_replicas_to_sb_replicas(c, &new);

	if (!ret)
		swap(c->replicas, new);

	kfree(new.entries);

	percpu_up_write(&c->mark_lock);

	if (!ret)
		bch2_write_super(c);

	mutex_unlock(&c->sb_lock);

	return ret;
}

/* Replicas tracking - superblock: */

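/*
 * Superblock replicas entries are variable length; building the cpu table
 * pads every entry to the largest entry size so the table can be indexed
 * directly, then sorts each entry's device list.
 */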
static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
				   struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v1 *e, *dst;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		dst = cpu_replicas_entry(cpu_r, idx++);
		memcpy(dst, e, replicas_entry_bytes(e));
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}

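/*
 * Same as above for the v0 format, which lacks nr_required: entry_size is
 * grown by the difference between the v1 and v0 headers, and nr_required
 * defaults to 1 in the converted entries.
 */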
static int
__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
				      struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v0 *e;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	entry_size += sizeof(struct bch_replicas_entry_v1) -
		sizeof(struct bch_replicas_entry_v0);

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		struct bch_replicas_entry_v1 *dst =
			cpu_replicas_entry(cpu_r, idx++);

		dst->data_type	= e->data_type;
		dst->nr_devs	= e->nr_devs;
		dst->nr_required = 1;
		memcpy(dst->devs, e->devs, e->nr_devs);
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}

int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
	struct bch_sb_field_replicas *sb_v1;
	struct bch_sb_field_replicas_v0 *sb_v0;
	struct bch_replicas_cpu new_r = { 0, 0, NULL };
	int ret = 0;

	if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
		ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
	else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
		ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
	if (ret)
		return ret;

	bch2_cpu_replicas_sort(&new_r);

	percpu_down_write(&c->mark_lock);
	swap(c->replicas, new_r);
	percpu_up_write(&c->mark_lock);

	kfree(new_r.entries);

	return 0;
}

static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
					       struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas_v0 *sb_r;
	struct bch_replicas_entry_v0 *dst;
	struct bch_replicas_entry_v1 *src;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src)
		bytes += replicas_entry_bytes(src) - 1;

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		dst->data_type	= src->data_type;
		dst->nr_devs	= src->nr_devs;
		memcpy(dst->devs, src->devs, src->nr_devs);

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

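/*
 * Writing out the superblock field: v1 is only needed when some entry has
 * nr_required != 1; otherwise the more compact v0 encoding (one byte smaller
 * per entry) is used. Whichever version is written, the other field is
 * deleted so the two never coexist.
 */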
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
					    struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_entry_v1 *dst, *src;
	bool need_v1 = false;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src) {
		bytes += replicas_entry_bytes(src);
		if (src->nr_required != 1)
			need_v1 = true;
	}

	if (!need_v1)
		return bch2_cpu_replicas_to_sb_replicas_v0(c, r);

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		memcpy(dst, src, replicas_entry_bytes(src));

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

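/*
 * Validation sorts with plain sort_r() rather than the eytzinger sort so that
 * duplicate entries end up adjacent, where the memcmp() of neighbours below
 * can catch them.
 */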
static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
				      struct bch_sb *sb,
				      struct printbuf *err)
{
	unsigned i;

	sort_r(cpu_r->entries,
	       cpu_r->nr,
	       cpu_r->entry_size,
	       bch2_memcmp, NULL,
	       (void *)(size_t)cpu_r->entry_size);

	for (i = 0; i < cpu_r->nr; i++) {
		struct bch_replicas_entry_v1 *e =
			cpu_replicas_entry(cpu_r, i);

		int ret = bch2_replicas_entry_sb_validate(e, sb, err);
		if (ret)
			return ret;

		if (i + 1 < cpu_r->nr) {
			struct bch_replicas_entry_v1 *n =
				cpu_replicas_entry(cpu_r, i + 1);

			BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);

			if (!memcmp(e, n, cpu_r->entry_size)) {
				prt_printf(err, "duplicate replicas entry ");
				bch2_replicas_entry_to_text(err, e);
				return -BCH_ERR_invalid_sb_replicas;
			}
		}
	}

	return 0;
}

static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
				     enum bch_validate_flags flags, struct printbuf *err)
{
	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}

static void bch2_sb_replicas_to_text(struct printbuf *out,
				     struct bch_sb *sb,
				     struct bch_sb_field *f)
{
	struct bch_sb_field_replicas *r = field_to_type(f, replicas);
	struct bch_replicas_entry_v1 *e;
	bool first = true;

	for_each_replicas_entry(r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
	prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
	.validate	= bch2_sb_replicas_validate,
	.to_text	= bch2_sb_replicas_to_text,
};

static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
					enum bch_validate_flags flags, struct printbuf *err)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}

static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
					struct bch_sb *sb,
					struct bch_sb_field *f)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_entry_v0 *e;
	bool first = true;

	for_each_replicas_entry(sb_r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_v0_to_text(out, e);
	}
	prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
	.validate	= bch2_sb_replicas_v0_validate,
	.to_text	= bch2_sb_replicas_v0_to_text,
};

/* Query replicas: */

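/*
 * Check whether the devices in @devs suffice for every replicas entry, given
 * the BCH_FORCE_IF_* flags in @flags: an entry with fewer than nr_devs
 * devices online is degraded, one with fewer than nr_required is lost.
 */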
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
			   unsigned flags, bool print)
{
	struct bch_replicas_entry_v1 *e;
	bool ret = true;

	percpu_down_read(&c->mark_lock);
	for_each_cpu_replicas_entry(&c->replicas, e) {
		unsigned nr_online = 0, nr_failed = 0, dflags = 0;
		bool metadata = e->data_type < BCH_DATA_user;

		if (e->data_type == BCH_DATA_cached)
			continue;

		rcu_read_lock();
		for (unsigned i = 0; i < e->nr_devs; i++) {
			if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
				nr_failed++;
				continue;
			}

			nr_online += test_bit(e->devs[i], devs.d);

			struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
			nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
		}
		rcu_read_unlock();

		if (nr_online + nr_failed == e->nr_devs)
			continue;

		if (nr_online < e->nr_required)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_LOST
				: BCH_FORCE_IF_DATA_LOST;

		if (nr_online < e->nr_devs)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_DEGRADED
				: BCH_FORCE_IF_DATA_DEGRADED;

		if (dflags & ~flags) {
			if (print) {
				struct printbuf buf = PRINTBUF;

				bch2_replicas_entry_to_text(&buf, e);
				bch_err(c, "insufficient devices online (%u) for replicas entry %s",
					nr_online, buf.buf);
				printbuf_exit(&buf);
			}
			ret = false;
			break;
		}
	}
	percpu_up_read(&c->mark_lock);

	return ret;
}

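/*
 * Returns a bitmask of data types (1 << BCH_DATA_*) that the superblock
 * replicas entries say are present on @dev, whichever field version is in
 * use.
 */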
unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
	struct bch_sb_field_replicas *replicas;
	struct bch_sb_field_replicas_v0 *replicas_v0;
	unsigned data_has = 0;

	replicas = bch2_sb_field_get(sb, replicas);
	replicas_v0 = bch2_sb_field_get(sb, replicas_v0);

	if (replicas) {
		struct bch_replicas_entry_v1 *r;

		for_each_replicas_entry(replicas, r) {
			if (r->data_type >= sizeof(data_has) * 8)
				continue;

			for (unsigned i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
		}
	} else if (replicas_v0) {
		struct bch_replicas_entry_v0 *r;

		for_each_replicas_entry_v0(replicas_v0, r) {
			if (r->data_type >= sizeof(data_has) * 8)
				continue;

			for (unsigned i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
		}
	}

	return data_has;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
	mutex_lock(&c->sb_lock);
	unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
	mutex_unlock(&c->sb_lock);

	return ret;
}

void bch2_fs_replicas_exit(struct bch_fs *c)
{
	kfree(c->replicas.entries);
	kfree(c->replicas_gc.entries);
}