// SPDX-License-Identifier: GPL-2.0

/* erasure coding */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_write.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "util.h"

#include <linux/sort.h>

#ifdef __KERNEL__

#include <linux/raid/pq.h>
#include <linux/raid/xor.h>

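/*
 * Recover a single failed block by XORing together all the others (also used
 * to generate parity, by passing failed_idx == number of data blocks): the
 * failed block's buffer is swapped into slot 0, seeded with a copy of block
 * 1, then the remaining blocks are XORed in in batches of at most
 * MAX_XOR_BLOCKS.
 */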
static void raid5_recov(unsigned disks, unsigned failed_idx,
			size_t size, void **data)
{
	unsigned i = 2, nr;

	BUG_ON(failed_idx >= disks);

	swap(data[0], data[failed_idx]);
	memcpy(data[0], data[1], size);

	while (i < disks) {
		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
		xor_blocks(nr, size, data[0], data + i);
		i += nr;
	}

	swap(data[0], data[failed_idx]);
}

static void raid_gen(int nd, int np, size_t size, void **v)
{
	if (np >= 1)
		raid5_recov(nd + np, nd, size, v);
	if (np >= 2)
		raid6_call.gen_syndrome(nd + np, size, v);
	BUG_ON(np > 2);
}

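/*
 * Reconstruct @nr failed blocks: @ir holds the failed block indices in
 * ascending order, with indices >= @nd denoting the P and Q parity blocks.
 */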
static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
{
	switch (nr) {
	case 0:
		break;
	case 1:
		if (ir[0] < nd + 1)
			raid5_recov(nd + 1, ir[0], size, v);
		else
			raid6_call.gen_syndrome(nd + np, size, v);
		break;
	case 2:
		if (ir[1] < nd) {
			/* data+data failure. */
			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
		} else if (ir[0] < nd) {
			/* data + p/q failure */

			if (ir[1] == nd) /* data + p failure */
				raid6_datap_recov(nd + np, size, ir[0], v);
			else { /* data + q failure */
				raid5_recov(nd + 1, ir[0], size, v);
				raid6_call.gen_syndrome(nd + np, size, v);
			}
		} else {
			raid_gen(nd, np, size, v);
		}
		break;
	default:
		BUG();
	}
}

#else

#include <raid/raid.h>

#endif

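/*
 * Per-block bio for stripe reads/writes: carries a back-reference to the
 * stripe buffer and the block index so that ec_block_endio() can flag the
 * block as invalid on I/O error.
 */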
struct ec_bio {
	struct bch_dev		*ca;
	struct ec_stripe_buf	*buf;
	size_t			idx;
	struct bio		bio;
};

/* Stripes btree keys: */

int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
			 enum bch_validate_flags flags)
{
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
	int ret = 0;

	bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
			 bpos_gt(k.k->p, POS(0, U32_MAX)),
			 c, stripe_pos_bad,
			 "stripe at bad pos");

	bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s),
			 c, stripe_val_size_bad,
			 "incorrect value size (%zu < %u)",
			 bkey_val_u64s(k.k), stripe_val_u64s(s));

	bkey_fsck_err_on(s->csum_granularity_bits >= 64,
			 c, stripe_csum_granularity_bad,
			 "invalid csum granularity (%u >= 64)",
			 s->csum_granularity_bits);

	ret = bch2_bkey_ptrs_validate(c, k, flags);
fsck_err:
	return ret;
}

void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
			 struct bkey_s_c k)
{
	const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
	struct bch_stripe s = {};

	memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));

	unsigned nr_data = s.nr_blocks - s.nr_redundant;

	prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
		   s.algorithm,
		   le16_to_cpu(s.sectors),
		   nr_data,
		   s.nr_redundant);
	bch2_prt_csum_type(out, s.csum_type);
	prt_str(out, " gran ");
	if (s.csum_granularity_bits < 64)
		prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
	else
		prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);

	if (s.disk_label) {
		prt_str(out, " label");
		bch2_disk_path_to_text(out, c, s.disk_label - 1);
	}

	for (unsigned i = 0; i < s.nr_blocks; i++) {
		const struct bch_extent_ptr *ptr = sp->ptrs + i;

		if ((void *) ptr >= bkey_val_end(k))
			break;

		prt_char(out, ' ');
		bch2_extent_ptr_to_text(out, c, ptr);

		if (s.csum_type < BCH_CSUM_NR &&
		    i < nr_data &&
		    stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
			prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
	}
}

/* Triggers: */

static int __mark_stripe_bucket(struct btree_trans *trans,
				struct bch_dev *ca,
				struct bkey_s_c_stripe s,
				unsigned ptr_idx, bool deleting,
				struct bpos bucket,
				struct bch_alloc_v4 *a,
				enum btree_iter_update_trigger_flags flags)
{
	const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
	unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
	bool parity = ptr_idx >= nr_data;
	enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
	s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	struct bch_fs *c = trans->c;
	if (deleting)
		sectors = -sectors;

	if (!deleting) {
		if (bch2_trans_inconsistent_on(a->stripe ||
					       a->stripe_redundancy, trans,
				"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
				bucket.inode, bucket.offset, a->gen,
				bch2_data_type_str(a->data_type),
				a->dirty_sectors,
				a->stripe, s.k->p.offset,
				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
			ret = -BCH_ERR_mark_stripe;
			goto err;
		}

		if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
				"bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
				bucket.inode, bucket.offset, a->gen,
				bch2_data_type_str(a->data_type),
				a->dirty_sectors,
				a->cached_sectors,
				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
			ret = -BCH_ERR_mark_stripe;
			goto err;
		}
	} else {
		if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
					       a->stripe_redundancy != s.v->nr_redundant, trans,
				"bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
				bucket.inode, bucket.offset, a->gen,
				a->stripe,
				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
			ret = -BCH_ERR_mark_stripe;
			goto err;
		}

		if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
				"bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
				bucket.inode, bucket.offset, a->gen,
				bch2_data_type_str(a->data_type),
				bch2_data_type_str(data_type),
				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
			ret = -BCH_ERR_mark_stripe;
			goto err;
		}

		if (bch2_trans_inconsistent_on(parity &&
					       (a->dirty_sectors != -sectors ||
						a->cached_sectors), trans,
				"bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
				bucket.inode, bucket.offset, a->gen,
				a->dirty_sectors,
				a->cached_sectors,
				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
			ret = -BCH_ERR_mark_stripe;
			goto err;
		}
	}

	if (sectors) {
		ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
					     a->gen, a->data_type, &a->dirty_sectors);
		if (ret)
			goto err;
	}

	if (!deleting) {
		a->stripe		= s.k->p.offset;
		a->stripe_redundancy	= s.v->nr_redundant;
		alloc_data_type_set(a, data_type);
	} else {
		a->stripe		= 0;
		a->stripe_redundancy	= 0;
		alloc_data_type_set(a, BCH_DATA_user);
	}
err:
	printbuf_exit(&buf);
	return ret;
}

static int mark_stripe_bucket(struct btree_trans *trans,
			      struct bkey_s_c_stripe s,
			      unsigned ptr_idx, bool deleting,
			      enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
	if (unlikely(!ca)) {
		if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
			ret = -BCH_ERR_mark_stripe;
		goto err;
	}

	struct bpos bucket = PTR_BUCKET_POS(ca, ptr);

	if (flags & BTREE_TRIGGER_transactional) {
		struct bkey_i_alloc_v4 *a =
			bch2_trans_start_alloc_update(trans, bucket, 0);
		ret = PTR_ERR_OR_ZERO(a) ?:
			__mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
	}

	if (flags & BTREE_TRIGGER_gc) {
		percpu_down_read(&c->mark_lock);
		struct bucket *g = gc_bucket(ca, bucket.offset);
		if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n  %s",
					    ptr->dev,
					    (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
			ret = -BCH_ERR_mark_stripe;
			goto err_unlock;
		}

		bucket_lock(g);
		struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
		ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
		alloc_to_bucket(g, new);
		bucket_unlock(g);
err_unlock:
		percpu_up_read(&c->mark_lock);
		if (!ret)
			ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
	}
err:
	bch2_dev_put(ca);
	printbuf_exit(&buf);
	return ret;
}

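/*
 * Mark or unmark the buckets referenced by a stripe key; blocks whose pointer
 * is unchanged between the old and new keys are skipped:
 */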
static int mark_stripe_buckets(struct btree_trans *trans,
			       struct bkey_s_c old, struct bkey_s_c new,
			       enum btree_iter_update_trigger_flags flags)
{
	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(old).v : NULL;
	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(new).v : NULL;

	BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);

	unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;

	for (unsigned i = 0; i < nr_blocks; i++) {
		if (new_s && old_s &&
		    !memcmp(&new_s->ptrs[i],
			    &old_s->ptrs[i],
			    sizeof(new_s->ptrs[i])))
			continue;

		if (new_s) {
			int ret = mark_stripe_bucket(trans,
					bkey_s_c_to_stripe(new), i, false, flags);
			if (ret)
				return ret;
		}

		if (old_s) {
			int ret = mark_stripe_bucket(trans,
					bkey_s_c_to_stripe(old), i, true, flags);
			if (ret)
				return ret;
		}
	}

	return 0;
}

static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
{
	m->sectors	= le16_to_cpu(s->sectors);
	m->algorithm	= s->algorithm;
	m->nr_blocks	= s->nr_blocks;
	m->nr_redundant	= s->nr_redundant;
	m->disk_label	= s->disk_label;
	m->blocks_nonempty = 0;

	for (unsigned i = 0; i < s->nr_blocks; i++)
		m->blocks_nonempty += !!stripe_blockcount_get(s, i);
}

int bch2_trigger_stripe(struct btree_trans *trans,
			enum btree_id btree, unsigned level,
			struct bkey_s_c old, struct bkey_s _new,
			enum btree_iter_update_trigger_flags flags)
{
	struct bkey_s_c new = _new.s_c;
	struct bch_fs *c = trans->c;
	u64 idx = new.k->p.offset;
	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(old).v : NULL;
	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(new).v : NULL;

	if (unlikely(flags & BTREE_TRIGGER_check_repair))
		return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);

	BUG_ON(new_s && old_s &&
	       (new_s->nr_blocks	!= old_s->nr_blocks ||
		new_s->nr_redundant	!= old_s->nr_redundant));

	if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
		/*
		 * If the pointers aren't changing, we don't need to do anything:
		 */
		if (new_s && old_s &&
		    new_s->nr_blocks	== old_s->nr_blocks &&
		    new_s->nr_redundant	== old_s->nr_redundant &&
		    !memcmp(old_s->ptrs, new_s->ptrs,
			    new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
			return 0;

		struct gc_stripe *gc = NULL;
		if (flags & BTREE_TRIGGER_gc) {
			gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
			if (!gc) {
				bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
				return -BCH_ERR_ENOMEM_mark_stripe;
			}

			/*
			 * This will be wrong when we bring back runtime gc: we should
			 * be unmarking the old key and then marking the new key
			 *
			 * Also: when we bring back runtime gc, locking
			 */
			gc->alive	= true;
			gc->sectors	= le16_to_cpu(new_s->sectors);
			gc->nr_blocks	= new_s->nr_blocks;
			gc->nr_redundant	= new_s->nr_redundant;

			for (unsigned i = 0; i < new_s->nr_blocks; i++)
				gc->ptrs[i] = new_s->ptrs[i];

			/*
			 * gc recalculates this field from stripe ptr
			 * references:
			 */
			memset(gc->block_sectors, 0, sizeof(gc->block_sectors));
		}

		if (new_s) {
			s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;

			struct disk_accounting_pos acc = {
				.type = BCH_DISK_ACCOUNTING_replicas,
			};
			bch2_bkey_to_replicas(&acc.replicas, new);
			int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
			if (ret)
				return ret;

			if (gc)
				memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
		}

		if (old_s) {
			s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;

			struct disk_accounting_pos acc = {
				.type = BCH_DISK_ACCOUNTING_replicas,
			};
			bch2_bkey_to_replicas(&acc.replicas, old);
			int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
			if (ret)
				return ret;
		}

		int ret = mark_stripe_buckets(trans, old, new, flags);
		if (ret)
			return ret;
	}

	if (flags & BTREE_TRIGGER_atomic) {
		struct stripe *m = genradix_ptr(&c->stripes, idx);

		if (!m) {
			struct printbuf buf1 = PRINTBUF;
			struct printbuf buf2 = PRINTBUF;

			bch2_bkey_val_to_text(&buf1, c, old);
			bch2_bkey_val_to_text(&buf2, c, new);
			bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
					    "old %s\n"
					    "new %s", idx, buf1.buf, buf2.buf);
			printbuf_exit(&buf2);
			printbuf_exit(&buf1);
			bch2_inconsistent_error(c);
			return -1;
		}

		if (!new_s) {
			bch2_stripes_heap_del(c, m, idx);

			memset(m, 0, sizeof(*m));
		} else {
			stripe_to_mem(m, new_s);

			if (!old_s)
				bch2_stripes_heap_insert(c, m, idx);
			else
				bch2_stripes_heap_update(c, m, idx);
		}
	}

	return 0;
}

/* returns the extent ptr that matched, setting *block to the stripe block it matched: */
static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
						struct bkey_s_c k, unsigned *block)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned i, nr_data = s->nr_blocks - s->nr_redundant;

	bkey_for_each_ptr(ptrs, ptr)
		for (i = 0; i < nr_data; i++)
			if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
						      le16_to_cpu(s->sectors))) {
				*block = i;
				return ptr;
			}

	return NULL;
}

static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
	switch (k.k->type) {
	case KEY_TYPE_extent: {
		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
		const union bch_extent_entry *entry;

		extent_for_each_entry(e, entry)
			if (extent_entry_type(entry) ==
			    BCH_EXTENT_ENTRY_stripe_ptr &&
			    entry->stripe_ptr.idx == idx)
				return true;

		break;
	}
	}

	return false;
}

/* Stripe bufs: */

static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
{
	if (buf->key.k.type == KEY_TYPE_stripe) {
		struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
		unsigned i;

		for (i = 0; i < s->v.nr_blocks; i++) {
			kvfree(buf->data[i]);
			buf->data[i] = NULL;
		}
	}
}

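/*
 * Allocate the per-block bounce buffers for the range [offset, offset + size)
 * of a stripe, widened to checksum granularity boundaries so that block
 * checksums can still be verified:
 */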
/* XXX: this is a non-mempoolified memory allocation: */
static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
			      unsigned offset, unsigned size)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned csum_granularity = 1U << v->csum_granularity_bits;
	unsigned end = offset + size;
	unsigned i;

	BUG_ON(end > le16_to_cpu(v->sectors));

	offset	= round_down(offset, csum_granularity);
	end	= min_t(unsigned, le16_to_cpu(v->sectors),
			round_up(end, csum_granularity));

	buf->offset	= offset;
	buf->size	= end - offset;

	memset(buf->valid, 0xFF, sizeof(buf->valid));

	for (i = 0; i < v->nr_blocks; i++) {
		buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
		if (!buf->data[i])
			goto err;
	}

	return 0;
err:
	ec_stripe_buf_exit(buf);
	return -BCH_ERR_ENOMEM_stripe_buf;
}

/* Checksumming: */

static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
					 unsigned block, unsigned offset)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned end = buf->offset + buf->size;
	unsigned len = min(csum_granularity, end - offset);

	BUG_ON(offset >= end);
	BUG_ON(offset <  buf->offset);
	BUG_ON(offset & (csum_granularity - 1));
	BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
	       (len & (csum_granularity - 1)));

	return bch2_checksum(NULL, v->csum_type,
			     null_nonce(),
			     buf->data[block] + ((offset - buf->offset) << 9),
			     len << 9);
}

static void ec_generate_checksums(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned i, j, csums_per_device = stripe_csums_per_device(v);

	if (!v->csum_type)
		return;

	BUG_ON(buf->offset);
	BUG_ON(buf->size != le16_to_cpu(v->sectors));

	for (i = 0; i < v->nr_blocks; i++)
		for (j = 0; j < csums_per_device; j++)
			stripe_csum_set(v, i, j,
				ec_block_checksum(buf, i, j << v->csum_granularity_bits));
}

static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned i;

	if (!v->csum_type)
		return;

	for (i = 0; i < v->nr_blocks; i++) {
		unsigned offset = buf->offset;
		unsigned end = buf->offset + buf->size;

		if (!test_bit(i, buf->valid))
			continue;

		while (offset < end) {
			unsigned j = offset >> v->csum_granularity_bits;
			unsigned len = min(csum_granularity, end - offset);
			struct bch_csum want = stripe_csum_get(v, i, j);
			struct bch_csum got = ec_block_checksum(buf, i, offset);

			if (bch2_crc_cmp(want, got)) {
				struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
				if (ca) {
					struct printbuf err = PRINTBUF;

					prt_str(&err, "stripe ");
					bch2_csum_err_msg(&err, v->csum_type, want, got);
					prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
					bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
					bch_err_ratelimited(ca, "%s", err.buf);
					printbuf_exit(&err);

					bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
				}

				clear_bit(i, buf->valid);
				break;
			}

			offset += len;
		}
	}
}

/* Erasure coding: */

static void ec_generate_ec(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = le16_to_cpu(v->sectors) << 9;

	raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
}

static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;

	return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
}

static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = buf->size << 9;

	if (ec_nr_failed(buf) > v->nr_redundant) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: unable to read enough blocks");
		return -1;
	}

	for (i = 0; i < nr_data; i++)
		if (!test_bit(i, buf->valid))
			failed[nr_failed++] = i;

	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
	return 0;
}

/* IO: */

static void ec_block_endio(struct bio *bio)
{
	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
	struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
	struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
	struct bch_dev *ca = ec_bio->ca;
	struct closure *cl = bio->bi_private;

	if (bch2_dev_io_err_on(bio->bi_status, ca,
			       bio_data_dir(bio)
			       ? BCH_MEMBER_ERROR_write
			       : BCH_MEMBER_ERROR_read,
			       "erasure coding %s error: %s",
			       bio_data_dir(bio) ? "write" : "read",
			       bch2_blk_status_to_str(bio->bi_status)))
		clear_bit(ec_bio->idx, ec_bio->buf->valid);

	int stale = dev_ptr_stale(ca, ptr);
	if (stale) {
		bch_err_ratelimited(ca->fs,
				    "error %s stripe: stale/invalid pointer (%i) after io",
				    bio_data_dir(bio) == READ ? "reading from" : "writing to",
				    stale);
		clear_bit(ec_bio->idx, ec_bio->buf->valid);
	}

	bio_put(&ec_bio->bio);
	percpu_ref_put(&ca->io_ref);
	closure_put(cl);
}

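/*
 * Read or write one block of a stripe, splitting the I/O into as many bios as
 * needed to stay within BIO_MAX_VECS pages each:
 */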
static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
			blk_opf_t opf, unsigned idx, struct closure *cl)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned offset = 0, bytes = buf->size << 9;
	struct bch_extent_ptr *ptr = &v->ptrs[idx];
	enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
		? BCH_DATA_user
		: BCH_DATA_parity;
	int rw = op_is_write(opf);

	struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
	if (!ca) {
		clear_bit(idx, buf->valid);
		return;
	}

	int stale = dev_ptr_stale(ca, ptr);
	if (stale) {
		bch_err_ratelimited(c,
				    "error %s stripe: stale pointer (%i)",
				    rw == READ ? "reading from" : "writing to",
				    stale);
		clear_bit(idx, buf->valid);
		percpu_ref_put(&ca->io_ref);
		return;
	}

	this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);

	while (offset < bytes) {
		unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
					   DIV_ROUND_UP(bytes, PAGE_SIZE));
		unsigned b = min_t(size_t, bytes - offset,
				   nr_iovecs << PAGE_SHIFT);
		struct ec_bio *ec_bio;

		ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
						       nr_iovecs,
						       opf,
						       GFP_KERNEL,
						       &c->ec_bioset),
				      struct ec_bio, bio);

		ec_bio->ca			= ca;
		ec_bio->buf			= buf;
		ec_bio->idx			= idx;

		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
		ec_bio->bio.bi_end_io		= ec_block_endio;
		ec_bio->bio.bi_private		= cl;

		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);

		closure_get(cl);
		percpu_ref_get(&ca->io_ref);

		submit_bio(&ec_bio->bio);

		offset += b;
	}

	percpu_ref_put(&ca->io_ref);
}

static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
				struct ec_stripe_buf *stripe)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
			       POS(0, idx), BTREE_ITER_slots);
	ret = bkey_err(k);
	if (ret)
		goto err;
	if (k.k->type != KEY_TYPE_stripe) {
		ret = -ENOENT;
		goto err;
	}
	bkey_reassemble(&stripe->key, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* recovery read path: */
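/*
 * Reconstruct the data an extent pointer points to, from a degraded stripe:
 * read every block we can, verify checksums, rebuild what's missing from
 * parity, then copy the result into the read bio:
 */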
int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
			struct bkey_s_c orig_k)
{
	struct bch_fs *c = trans->c;
	struct ec_stripe_buf *buf = NULL;
	struct closure cl;
	struct bch_stripe *v;
	unsigned i, offset;
	const char *msg = NULL;
	struct printbuf msgbuf = PRINTBUF;
	int ret = 0;

	closure_init_stack(&cl);

	BUG_ON(!rbio->pick.has_ec);

	buf = kzalloc(sizeof(*buf), GFP_NOFS);
	if (!buf)
		return -BCH_ERR_ENOMEM_ec_read_extent;

	ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
	if (ret) {
		msg = "stripe not found";
		goto err;
	}

	v = &bkey_i_to_stripe(&buf->key)->v;

	if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
		msg = "pointer doesn't match stripe";
		goto err;
	}

	offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
	if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
		msg = "read is bigger than stripe";
		goto err;
	}

	ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
	if (ret) {
		msg = "-ENOMEM";
		goto err;
	}

	for (i = 0; i < v->nr_blocks; i++)
		ec_block_io(c, buf, REQ_OP_READ, i, &cl);

	closure_sync(&cl);

	if (ec_nr_failed(buf) > v->nr_redundant) {
		msg = "unable to read enough blocks";
		goto err;
	}

	ec_validate_checksums(c, buf);

	ret = ec_do_recov(c, buf);
	if (ret)
		goto err;

	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
		      buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
out:
	ec_stripe_buf_exit(buf);
	kfree(buf);
	return ret;
err:
	bch2_bkey_val_to_text(&msgbuf, c, orig_k);
	bch_err_ratelimited(c,
			    "error doing reconstruct read: %s\n  %s", msg, msgbuf.buf);
	printbuf_exit(&msgbuf);
	ret = -BCH_ERR_stripe_reconstruct;
	goto out;
}

/* stripe bucket accounting: */

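/*
 * Ensure the stripes heap and radix trees can index stripe @idx; a larger
 * replacement heap is allocated outside ec_stripes_heap_lock and swapped in
 * under it:
 */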
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
	ec_stripes_heap n, *h = &c->ec_stripes_heap;

	if (idx >= h->size) {
		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
			return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

		mutex_lock(&c->ec_stripes_heap_lock);
		if (n.size > h->size) {
			memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
			n.nr = h->nr;
			swap(*h, n);
		}
		mutex_unlock(&c->ec_stripes_heap_lock);

		free_heap(&n);
	}

	if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

	if (c->gc_pos.phase != GC_PHASE_not_running &&
	    !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

	return 0;
}

static int ec_stripe_mem_alloc(struct btree_trans *trans,
			       struct btree_iter *iter)
{
	return allocate_dropping_locks_errcode(trans,
			__ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
}

/*
 * Hash table of open stripes:
 * Stripes that are being created or modified are kept in a hash table, so that
 * stripe deletion can skip them.
 */

static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
	unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
	struct ec_stripe_new *s;

	hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
		if (s->idx == idx)
			return true;
	return false;
}

static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
	bool ret = false;

	spin_lock(&c->ec_stripes_new_lock);
	ret = __bch2_stripe_is_open(c, idx);
	spin_unlock(&c->ec_stripes_new_lock);

	return ret;
}

static bool bch2_try_open_stripe(struct bch_fs *c,
				 struct ec_stripe_new *s,
				 u64 idx)
{
	bool ret;

	spin_lock(&c->ec_stripes_new_lock);
	ret = !__bch2_stripe_is_open(c, idx);
	if (ret) {
		unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));

		s->idx = idx;
		hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
	}
	spin_unlock(&c->ec_stripes_new_lock);

	return ret;
}

static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
{
	BUG_ON(!s->idx);

	spin_lock(&c->ec_stripes_new_lock);
	hlist_del_init(&s->hash);
	spin_unlock(&c->ec_stripes_new_lock);

	s->idx = 0;
}

/* Heap of all existing stripes, ordered by blocks_nonempty */

static u64 stripe_idx_to_delete(struct bch_fs *c)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;

	lockdep_assert_held(&c->ec_stripes_heap_lock);

	if (h->nr &&
	    h->data[0].blocks_nonempty == 0 &&
	    !bch2_stripe_is_open(c, h->data[0].idx))
		return h->data[0].idx;

	return 0;
}

static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
						   size_t i)
{
	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);

	genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
}

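/*
 * Branchless l->blocks_nonempty < r->blocks_nonempty: as the min heap's
 * "less" callback this keeps the stripe with the fewest nonempty blocks at
 * the root, which is what stripe_idx_to_delete() looks at:
 */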
static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
{
	struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
	struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;

	return ((_l->blocks_nonempty > _r->blocks_nonempty) <
		(_l->blocks_nonempty < _r->blocks_nonempty));
}

static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
{
	struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
	struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
	ec_stripes_heap *_h = (ec_stripes_heap *)h;
	size_t i = _l - _h->data;
	size_t j = _r - _h->data;

	swap(*_l, *_r);

	ec_stripes_heap_set_backpointer(_h, i);
	ec_stripes_heap_set_backpointer(_h, j);
}

static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m = genradix_ptr(&c->stripes, idx);

	BUG_ON(m->heap_idx >= h->nr);
	BUG_ON(h->data[m->heap_idx].idx != idx);
}

void bch2_stripes_heap_del(struct bch_fs *c,
			   struct stripe *m, size_t idx)
{
	const struct min_heap_callbacks callbacks = {
		.less = ec_stripes_heap_cmp,
		.swp = ec_stripes_heap_swap,
	};

	mutex_lock(&c->ec_stripes_heap_lock);
	heap_verify_backpointer(c, idx);

	min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
	mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_stripes_heap_insert(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	const struct min_heap_callbacks callbacks = {
		.less = ec_stripes_heap_cmp,
		.swp = ec_stripes_heap_swap,
	};

	mutex_lock(&c->ec_stripes_heap_lock);
	BUG_ON(min_heap_full(&c->ec_stripes_heap));

	genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
	min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
			.idx = idx,
			.blocks_nonempty = m->blocks_nonempty,
		}),
		&callbacks,
		&c->ec_stripes_heap);

	heap_verify_backpointer(c, idx);
	mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_stripes_heap_update(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	const struct min_heap_callbacks callbacks = {
		.less = ec_stripes_heap_cmp,
		.swp = ec_stripes_heap_swap,
	};
	ec_stripes_heap *h = &c->ec_stripes_heap;
	bool do_deletes;
	size_t i;

	mutex_lock(&c->ec_stripes_heap_lock);
	heap_verify_backpointer(c, idx);

	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

	i = m->heap_idx;
	min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
	min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);

	heap_verify_backpointer(c, idx);

	do_deletes = stripe_idx_to_delete(c) != 0;
	mutex_unlock(&c->ec_stripes_heap_lock);

	if (do_deletes)
		bch2_do_stripe_deletes(c);
}

/* stripe deletion */

static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_stripe s;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
			       BTREE_ITER_intent);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_stripe) {
		bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
		ret = -EINVAL;
		goto err;
	}

	s = bkey_s_c_to_stripe(k);
	for (unsigned i = 0; i < s.v->nr_blocks; i++)
		if (stripe_blockcount_get(s.v, i)) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, k);
			bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
			printbuf_exit(&buf);
			ret = -EINVAL;
			goto err;
		}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static void ec_stripe_delete_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, ec_stripe_delete_work);

	while (1) {
		mutex_lock(&c->ec_stripes_heap_lock);
		u64 idx = stripe_idx_to_delete(c);
		mutex_unlock(&c->ec_stripes_heap_lock);

		if (!idx)
			break;

		int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
					ec_stripe_delete(trans, idx));
		bch_err_fn(c, ret);
		if (ret)
			break;
	}

	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}

void bch2_do_stripe_deletes(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
	    !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}

/* stripe creation: */

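/*
 * Write out a new or updated stripe key, carrying forward the on-disk block
 * counts; if a block's pointer was invalidated underneath us (by device
 * removal), that is propagated to the new key:
 */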
static int ec_stripe_key_update(struct btree_trans *trans,
				struct bkey_i_stripe *old,
				struct bkey_i_stripe *new)
{
	struct bch_fs *c = trans->c;
	bool create = !old;

	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
					       new->k.p, BTREE_ITER_intent);
	int ret = bkey_err(k);
	if (ret)
		goto err;

	if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
				    c, "error %s stripe: got existing key type %s",
				    create ? "creating" : "updating",
				    bch2_bkey_types[k.k->type])) {
		ret = -EINVAL;
		goto err;
	}

	if (k.k->type == KEY_TYPE_stripe) {
		const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;

		BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
		BUG_ON(old->v.nr_blocks != v->nr_blocks);

		for (unsigned i = 0; i < new->v.nr_blocks; i++) {
			unsigned sectors = stripe_blockcount_get(v, i);

			if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
				struct printbuf buf = PRINTBUF;

				prt_printf(&buf, "stripe changed nonempty block %u", i);
				prt_str(&buf, "\nold: ");
				bch2_bkey_val_to_text(&buf, c, k);
				prt_str(&buf, "\nnew: ");
				bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
				bch2_fs_inconsistent(c, "%s", buf.buf);
				printbuf_exit(&buf);
				ret = -EINVAL;
				goto err;
			}

			/*
			 * If the stripe ptr changed underneath us, it must have
			 * been dev_remove_stripes() -> invalidate_stripe_to_dev()
			 */
			if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
				BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);

				if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
					new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
			}

			stripe_blockcount_set(&new->v, i, sectors);
		}
	}

	ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

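/*
 * Process one backpointer from a data bucket of a new stripe: if it still
 * points to an extent that belongs to the stripe, rewrite the extent to add
 * a stripe pointer entry:
 */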
static int ec_stripe_update_extent(struct btree_trans *trans,
				   struct bch_dev *ca,
				   struct bpos bucket, u8 gen,
				   struct ec_stripe_buf *s,
				   struct bpos *bp_pos)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	struct bch_fs *c = trans->c;
	struct bch_backpointer bp;
	struct btree_iter iter;
	struct bkey_s_c k;
	const struct bch_extent_ptr *ptr_c;
	struct bch_extent_ptr *ec_ptr = NULL;
	struct bch_extent_stripe_ptr stripe_ptr;
	struct bkey_i *n;
	int ret, dev;
	unsigned block;

	ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
				bp_pos, &bp, BTREE_ITER_cached);
	if (ret)
		return ret;
	if (bpos_eq(*bp_pos, SPOS_MAX))
		return 0;

	if (bp.level) {
		struct printbuf buf = PRINTBUF;
		struct btree_iter node_iter;
		struct btree *b;

		b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
		bch2_trans_iter_exit(trans, &node_iter);

		if (!b)
			return 0;

		prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
		bch2_backpointer_to_text(&buf, &bp);

		bch2_fs_inconsistent(c, "%s", buf.buf);
		printbuf_exit(&buf);
		return -EIO;
	}

	k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent);
	ret = bkey_err(k);
	if (ret)
		return ret;
	if (!k.k) {
		/*
		 * extent no longer exists - we could flush the btree
		 * write buffer and retry to verify, but no need:
		 */
		return 0;
	}

	if (extent_has_stripe_ptr(k, s->key.k.p.offset))
		goto out;

	ptr_c = bkey_matches_stripe(v, k, &block);
	/*
	 * It doesn't generally make sense to erasure code cached ptrs:
	 * XXX: should we be incrementing a counter?
	 */
	if (!ptr_c || ptr_c->cached)
		goto out;

	dev = v->ptrs[block].dev;

	n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
	ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		goto out;

	bkey_reassemble(n, k);

	bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
	ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
	BUG_ON(!ec_ptr);

	stripe_ptr = (struct bch_extent_stripe_ptr) {
		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
		.block		= block,
		.redundancy	= v->nr_redundant,
		.idx		= s->key.k.p.offset,
	};

	__extent_entry_insert(n,
			(union bch_extent_entry *) ec_ptr,
			(union bch_extent_entry *) &stripe_ptr);

	ret = bch2_trans_update(trans, &iter, n, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
				   unsigned block)
{
	struct bch_fs *c = trans->c;
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	struct bch_extent_ptr ptr = v->ptrs[block];
	struct bpos bp_pos = POS_MIN;
	int ret = 0;

	struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
	if (!ca)
		return -EIO;

	struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);

	while (1) {
		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_check_rw|
				BCH_TRANS_COMMIT_no_enospc,
			ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos));
		if (ret)
			break;
		if (bkey_eq(bp_pos, POS_MAX))
			break;

		bp_pos = bpos_nosnap_successor(bp_pos);
	}

	bch2_dev_put(ca);
	return ret;
}

static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	int ret = 0;

	ret = bch2_btree_write_buffer_flush_sync(trans);
	if (ret)
		goto err;

	for (i = 0; i < nr_data; i++) {
		ret = ec_stripe_update_bucket(trans, s, i);
		if (ret)
			break;
	}
err:
	bch2_trans_put(trans);

	return ret;
}

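/*
 * The unwritten tail of a data bucket still contributes to parity, so zero it
 * out both in the stripe buffer and on disk before p/q are computed:
 */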
static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
				       struct ec_stripe_new *s,
				       unsigned block,
				       struct open_bucket *ob)
{
	struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
	if (!ca) {
		s->err = -BCH_ERR_erofs_no_writes;
		return;
	}

	unsigned offset = ca->mi.bucket_size - ob->sectors_free;
	memset(s->new_stripe.data[block] + (offset << 9),
	       0,
	       ob->sectors_free << 9);

	int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
			ob->bucket * ca->mi.bucket_size + offset,
			ob->sectors_free,
			GFP_KERNEL, 0);

	percpu_ref_put(&ca->io_ref);

	if (ret)
		s->err = ret;
}

void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
{
	if (s->idx)
		bch2_stripe_close(c, s);
	kfree(s);
}

/*
 * data buckets of new stripe all written: create the stripe
 */
static void ec_stripe_create(struct ec_stripe_new *s)
{
	struct bch_fs *c = s->c;
	struct open_bucket *ob;
	struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	int ret;

	BUG_ON(s->h->s == s);

	closure_sync(&s->iodone);

	if (!s->err) {
		for (i = 0; i < nr_data; i++)
			if (s->blocks[i]) {
				ob = c->open_buckets + s->blocks[i];

				if (ob->sectors_free)
					zero_out_rest_of_ec_bucket(c, s, i, ob);
			}
	}

	if (s->err) {
		if (!bch2_err_matches(s->err, EROFS))
			bch_err(c, "error creating stripe: error writing data buckets");
		goto err;
	}

	if (s->have_existing_stripe) {
		ec_validate_checksums(c, &s->existing_stripe);

		if (ec_do_recov(c, &s->existing_stripe)) {
			bch_err(c, "error creating stripe: error reading existing stripe");
			goto err;
		}

		for (i = 0; i < nr_data; i++)
			if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
				swap(s->new_stripe.data[i],
				     s->existing_stripe.data[i]);

		ec_stripe_buf_exit(&s->existing_stripe);
	}

	BUG_ON(!s->allocated);
	BUG_ON(!s->idx);

	ec_generate_ec(&s->new_stripe);

	ec_generate_checksums(&s->new_stripe);

	/* write p/q: */
	for (i = nr_data; i < v->nr_blocks; i++)
		ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
	closure_sync(&s->iodone);

	if (ec_nr_failed(&s->new_stripe)) {
		bch_err(c, "error creating stripe: error writing redundancy buckets");
		goto err;
	}

	ret = bch2_trans_commit_do(c, &s->res, NULL,
		BCH_TRANS_COMMIT_no_check_rw|
		BCH_TRANS_COMMIT_no_enospc,
		ec_stripe_key_update(trans,
				     s->have_existing_stripe
				     ? bkey_i_to_stripe(&s->existing_stripe.key)
				     : NULL,
				     bkey_i_to_stripe(&s->new_stripe.key)));
	bch_err_msg(c, ret, "creating stripe key");
	if (ret)
		goto err;

	ret = ec_stripe_update_extents(c, &s->new_stripe);
	bch_err_msg(c, ret, "error updating extents");
	if (ret)
		goto err;
err:
	bch2_disk_reservation_put(c, &s->res);

	for (i = 0; i < v->nr_blocks; i++)
		if (s->blocks[i]) {
			ob = c->open_buckets + s->blocks[i];

			if (i < nr_data) {
				ob->ec = NULL;
				__bch2_open_bucket_put(c, ob);
			} else {
				bch2_open_bucket_put(c, ob);
			}
		}

	mutex_lock(&c->ec_stripe_new_lock);
	list_del(&s->list);
	mutex_unlock(&c->ec_stripe_new_lock);
	wake_up(&c->ec_stripe_new_wait);

	ec_stripe_buf_exit(&s->existing_stripe);
	ec_stripe_buf_exit(&s->new_stripe);
	closure_debug_destroy(&s->iodone);

	ec_stripe_new_put(c, s, STRIPE_REF_stripe);
}

static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
{
	struct ec_stripe_new *s;

	mutex_lock(&c->ec_stripe_new_lock);
	list_for_each_entry(s, &c->ec_stripe_new_list, list)
		if (!atomic_read(&s->ref[STRIPE_REF_io]))
			goto out;
	s = NULL;
out:
	mutex_unlock(&c->ec_stripe_new_lock);

	return s;
}

static void ec_stripe_create_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work,
		struct bch_fs, ec_stripe_create_work);
	struct ec_stripe_new *s;

	while ((s = get_pending_stripe(c)))
		ec_stripe_create(s);

	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}

void bch2_ec_do_stripe_creates(struct bch_fs *c)
{
	bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);

	if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}

static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s = h->s;

	lockdep_assert_held(&h->lock);

	BUG_ON(!s->allocated && !s->err);

	h->s		= NULL;
	s->pending	= true;

	mutex_lock(&c->ec_stripe_new_lock);
	list_add(&s->list, &c->ec_stripe_new_list);
	mutex_unlock(&c->ec_stripe_new_lock);

	ec_stripe_new_put(c, s, STRIPE_REF_io);
}

static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
{
	h->s->err = err;
	ec_stripe_new_set_pending(c, h);
}

void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
	struct ec_stripe_new *s = ob->ec;

	s->err = -EIO;
}

void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
	if (!ob)
		return NULL;

	BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);

	struct bch_dev *ca	= ob_dev(c, ob);
	unsigned offset		= ca->mi.bucket_size - ob->sectors_free;

	return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}

static int unsigned_cmp(const void *_l, const void *_r)
{
	unsigned l = *((const unsigned *) _l);
	unsigned r = *((const unsigned *) _r);

	return cmp_int(l, r);
}

/* pick most common bucket size: */
static unsigned pick_blocksize(struct bch_fs *c,
			       struct bch_devs_mask *devs)
{
	unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
	struct {
		unsigned nr, size;
	} cur = { 0, 0 }, best = { 0, 0 };

	for_each_member_device_rcu(c, ca, devs)
		sizes[nr++] = ca->mi.bucket_size;

	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);

	for (unsigned i = 0; i < nr; i++) {
		if (sizes[i] != cur.size) {
			if (cur.nr > best.nr)
				best = cur;

			cur.nr = 0;
			cur.size = sizes[i];
		}

		cur.nr++;
	}

	if (cur.nr > best.nr)
		best = cur;

	return best.size;
}

static bool may_create_new_stripe(struct bch_fs *c)
{
	return false;
}

static void ec_stripe_key_init(struct bch_fs *c,
			       struct bkey_i *k,
			       unsigned nr_data,
			       unsigned nr_parity,
			       unsigned stripe_size,
			       unsigned disk_label)
{
	struct bkey_i_stripe *s = bkey_stripe_init(k);
	unsigned u64s;

	s->v.sectors			= cpu_to_le16(stripe_size);
	s->v.algorithm			= 0;
	s->v.nr_blocks			= nr_data + nr_parity;
	s->v.nr_redundant		= nr_parity;
	s->v.csum_granularity_bits	= ilog2(c->opts.encoded_extent_max >> 9);
	s->v.csum_type			= BCH_CSUM_crc32c;
	s->v.disk_label			= disk_label;

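	/*
	 * The stripe key must fit in a bkey value; each doubling of the
	 * checksum granularity halves the number of checksums we store:
	 */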
1709  	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
1710  		BUG_ON(1 << s->v.csum_granularity_bits >=
1711  		       le16_to_cpu(s->v.sectors) ||
1712  		       s->v.csum_granularity_bits == U8_MAX);
1713  		s->v.csum_granularity_bits++;
1714  	}
1715  
1716  	set_bkey_val_u64s(&s->k, u64s);
1717  }
1718  
static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s;

	lockdep_assert_held(&h->lock);

	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;

	mutex_init(&s->lock);
	closure_init(&s->iodone, NULL);
	atomic_set(&s->ref[STRIPE_REF_stripe], 1);
	atomic_set(&s->ref[STRIPE_REF_io], 1);
	s->c		= c;
	s->h		= h;
	s->nr_data	= min_t(unsigned, h->nr_active_devs,
				BCH_BKEY_PTRS_MAX) - h->redundancy;
	s->nr_parity	= h->redundancy;

	ec_stripe_key_init(c, &s->new_stripe.key,
			   s->nr_data, s->nr_parity,
			   h->blocksize, h->disk_label);

	h->s = s;
	h->nr_created++;
	return 0;
}

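/*
 * The set of rw devices changed: recompute the head's device mask, dropping
 * devices with zero durability or a mismatched bucket size, and cancel a
 * not yet allocated stripe if any of its devices went away:
 */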
static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct bch_devs_mask devs = h->devs;

	rcu_read_lock();
	h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
				 ? group_to_target(h->disk_label - 1)
				 : 0);
	unsigned nr_devs = dev_mask_nr(&h->devs);

	for_each_member_device_rcu(c, ca, &h->devs)
		if (!ca->mi.durability)
			__clear_bit(ca->dev_idx, h->devs.d);
	unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);

	h->blocksize = pick_blocksize(c, &h->devs);

	h->nr_active_devs = 0;
	for_each_member_device_rcu(c, ca, &h->devs)
		if (ca->mi.bucket_size == h->blocksize)
			h->nr_active_devs++;

	rcu_read_unlock();

	/*
	 * If we only have redundancy + 1 devices, we're better off with just
	 * replication:
	 */
	h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;

	if (h->insufficient_devs) {
		const char *err;

		if (nr_devs < h->redundancy + 2)
			err = NULL;
		else if (nr_devs_with_durability < h->redundancy + 2)
			err = "cannot use durability=0 devices";
		else
			err = "mismatched bucket sizes";

		if (err)
			bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
				h->nr_active_devs, h->redundancy + 2, err);
	}

	struct bch_devs_mask devs_leaving;
	bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX);

	if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving))
		ec_stripe_new_cancel(c, h, -EINTR);

	h->rw_devs_change_count = c->rw_devs_change_count;
}

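/* Allocate a new stripe head, returned locked and on the filesystem's list: */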
static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
			 unsigned algo, unsigned redundancy,
			 enum bch_watermark watermark)
{
	struct ec_stripe_head *h;

	h = kzalloc(sizeof(*h), GFP_KERNEL);
	if (!h)
		return NULL;

	mutex_init(&h->lock);
	BUG_ON(!mutex_trylock(&h->lock));

	h->disk_label	= disk_label;
	h->algo		= algo;
	h->redundancy	= redundancy;
	h->watermark	= watermark;

	list_add(&h->list, &c->ec_stripe_head_list);
	return h;
}

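/*
 * Unlock a stripe head; if the current stripe has had all its data blocks
 * allocated, it's first queued for create/commit:
 */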
void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
{
	if (h->s &&
	    h->s->allocated &&
	    bitmap_weight(h->s->blocks_allocated,
			  h->s->nr_data) == h->s->nr_data)
		ec_stripe_new_set_pending(c, h);

	mutex_unlock(&h->lock);
}

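/*
 * Find or create the stripe head matching (@disk_label, @algo, @redundancy,
 * @watermark), returned locked; returns NULL if @redundancy is zero or
 * there aren't enough usable devices for erasure coding:
 */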
static struct ec_stripe_head *
__bch2_ec_stripe_head_get(struct btree_trans *trans,
			  unsigned disk_label,
			  unsigned algo,
			  unsigned redundancy,
			  enum bch_watermark watermark)
{
	struct bch_fs *c = trans->c;
	struct ec_stripe_head *h;
	int ret;

	if (!redundancy)
		return NULL;

	ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
	if (ret)
		return ERR_PTR(ret);

	if (test_bit(BCH_FS_going_ro, &c->flags)) {
		h = ERR_PTR(-BCH_ERR_erofs_no_writes);
		goto err;
	}

	list_for_each_entry(h, &c->ec_stripe_head_list, list)
		if (h->disk_label	== disk_label &&
		    h->algo		== algo &&
		    h->redundancy	== redundancy &&
		    h->watermark	== watermark) {
			ret = bch2_trans_mutex_lock(trans, &h->lock);
			if (ret) {
				h = ERR_PTR(ret);
				goto err;
			}
			goto found;
		}

	h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
	if (!h) {
		h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc);
		goto err;
	}
found:
	if (h->rw_devs_change_count != c->rw_devs_change_count)
		ec_stripe_head_devs_update(c, h);

	if (h->insufficient_devs) {
		mutex_unlock(&h->lock);
		h = NULL;
	}
err:
	mutex_unlock(&c->ec_stripe_head_lock);
	return h;
}

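/*
 * Allocate buckets for whichever data and parity blocks the new stripe is
 * still missing; blocks already present (e.g. from a stripe being reused)
 * are skipped:
 */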
static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
				    enum bch_watermark watermark, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct bch_devs_mask devs = h->devs;
	struct open_bucket *ob;
	struct open_buckets buckets;
	struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
	unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
	bool have_cache = true;
	int ret = 0;

	BUG_ON(v->nr_blocks	!= h->s->nr_data + h->s->nr_parity);
	BUG_ON(v->nr_redundant	!= h->s->nr_parity);

	/* We bypass the sector allocator which normally does this: */
	bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);

	for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
		/*
		 * Note: we don't yet repair invalid blocks (failed/removed
		 * devices) when reusing stripes - we still need a codepath to
		 * walk backpointers and update all extents that point to that
		 * block when updating the stripe
		 */
		if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
			__clear_bit(v->ptrs[i].dev, devs.d);

		if (i < h->s->nr_data)
			nr_have_data++;
		else
			nr_have_parity++;
	}

	BUG_ON(nr_have_data	> h->s->nr_data);
	BUG_ON(nr_have_parity	> h->s->nr_parity);

	buckets.nr = 0;
	if (nr_have_parity < h->s->nr_parity) {
		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
					    &h->parity_stripe,
					    &devs,
					    h->s->nr_parity,
					    &nr_have_parity,
					    &have_cache, 0,
					    BCH_DATA_parity,
					    watermark,
					    cl);

		open_bucket_for_each(c, &buckets, ob, i) {
			j = find_next_zero_bit(h->s->blocks_gotten,
					       h->s->nr_data + h->s->nr_parity,
					       h->s->nr_data);
			BUG_ON(j >= h->s->nr_data + h->s->nr_parity);

			h->s->blocks[j] = buckets.v[i];
			v->ptrs[j] = bch2_ob_ptr(c, ob);
			__set_bit(j, h->s->blocks_gotten);
		}

		if (ret)
			return ret;
	}

	buckets.nr = 0;
	if (nr_have_data < h->s->nr_data) {
		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
					    &h->block_stripe,
					    &devs,
					    h->s->nr_data,
					    &nr_have_data,
					    &have_cache, 0,
					    BCH_DATA_user,
					    watermark,
					    cl);

		open_bucket_for_each(c, &buckets, ob, i) {
			j = find_next_zero_bit(h->s->blocks_gotten,
					       h->s->nr_data, 0);
			BUG_ON(j >= h->s->nr_data);

			h->s->blocks[j] = buckets.v[i];
			v->ptrs[j] = bch2_ob_ptr(c, ob);
			__set_bit(j, h->s->blocks_gotten);
		}

		if (ret)
			return ret;
	}

	return 0;
}

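/*
 * Scan the stripes heap for an existing stripe with empty blocks worth
 * reusing: it must match the head's label, algorithm, redundancy and
 * blocksize, and we must be able to open it; returns the stripe index, or
 * -1 if nothing suitable was found:
 */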
static s64 get_existing_stripe(struct bch_fs *c,
			       struct ec_stripe_head *head)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m;
	size_t heap_idx;
	u64 stripe_idx;
	s64 ret = -1;

	if (may_create_new_stripe(c))
		return -1;

	mutex_lock(&c->ec_stripes_heap_lock);
	for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
		/* No blocks worth reusing, stripe will just be deleted: */
		if (!h->data[heap_idx].blocks_nonempty)
			continue;

		stripe_idx = h->data[heap_idx].idx;

		m = genradix_ptr(&c->stripes, stripe_idx);

		if (m->disk_label	== head->disk_label &&
		    m->algorithm	== head->algo &&
		    m->nr_redundant	== head->redundancy &&
		    m->sectors		== head->blocksize &&
		    m->blocks_nonempty	< m->nr_blocks - m->nr_redundant &&
		    bch2_try_open_stripe(c, head->s, stripe_idx)) {
			ret = stripe_idx;
			break;
		}
	}
	mutex_unlock(&c->ec_stripes_heap_lock);
	return ret;
}

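/*
 * Switch the head's in-flight stripe over to reusing an existing stripe:
 * read the existing stripe's key and data, and drop any buckets we already
 * allocated, since they might conflict with blocks being kept:
 */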
static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
{
	struct bch_fs *c = trans->c;
	struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
	struct bch_stripe *existing_v;
	unsigned i;
	s64 idx;
	int ret;

	/*
	 * If we can't allocate a new stripe, and there are no stripes with
	 * empty blocks for us to reuse, that means we have to wait on copygc:
	 */
	idx = get_existing_stripe(c, h);
	if (idx < 0)
		return -BCH_ERR_stripe_alloc_blocked;

	ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
	bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
			     "reading stripe key: %s", bch2_err_str(ret));
	if (ret) {
		bch2_stripe_close(c, h->s);
		return ret;
	}

	existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;

	BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
	h->s->nr_data = existing_v->nr_blocks -
		existing_v->nr_redundant;

	ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
	if (ret) {
		bch2_stripe_close(c, h->s);
		return ret;
	}

	BUG_ON(h->s->existing_stripe.size != h->blocksize);
	BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));

	/*
	 * Free buckets we initially allocated - they might conflict with
	 * blocks from the stripe we're reusing:
	 */
	for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
		bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
		h->s->blocks[i] = 0;
	}
	memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
	memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));

	for (i = 0; i < existing_v->nr_blocks; i++) {
		if (stripe_blockcount_get(existing_v, i)) {
			__set_bit(i, h->s->blocks_gotten);
			__set_bit(i, h->s->blocks_allocated);
		}

		ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
	}

	bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
	h->s->have_existing_stripe = true;

	return 0;
}

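/*
 * Reserve a slot in the stripes btree for a brand new stripe: take a disk
 * reservation, then look for an unused (deleted) key at or after
 * c->ec_stripe_hint, wrapping around to the start once if necessary:
 */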
static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bpos min_pos = POS(0, 1);
	struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
	int ret;

	if (!h->s->res.sectors) {
		ret = bch2_disk_reservation_get(c, &h->s->res,
					h->blocksize,
					h->s->nr_parity,
					BCH_DISK_RESERVATION_NOFAIL);
		if (ret)
			return ret;
	}

	for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
			   BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
		if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
			if (start_pos.offset) {
				start_pos = min_pos;
				bch2_btree_iter_set_pos(&iter, start_pos);
				continue;
			}

			ret = -BCH_ERR_ENOSPC_stripe_create;
			break;
		}

		if (bkey_deleted(k.k) &&
		    bch2_try_open_stripe(c, h->s, k.k->p.offset))
			break;
	}

	c->ec_stripe_hint = iter.pos.offset;

	if (ret)
		goto err;

	ret = ec_stripe_mem_alloc(trans, &iter);
	if (ret) {
		bch2_stripe_close(c, h->s);
		goto err;
	}

	h->s->new_stripe.key.k.p = iter.pos;
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
err:
	bch2_disk_reservation_put(c, &h->s->res);
	goto out;
}

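/*
 * Get a locked stripe head with a fully allocated new stripe for writing:
 * try a brand new stripe first, fall back to reusing an existing stripe
 * with empty blocks, and if neither is possible wait on @cl for copygc to
 * free up space (unless the caller can't block):
 */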
struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
					       unsigned target,
					       unsigned algo,
					       unsigned redundancy,
					       enum bch_watermark watermark,
					       struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct ec_stripe_head *h;
	bool waiting = false;
	unsigned disk_label = 0;
	struct target t = target_decode(target);
	int ret;

	if (t.type == TARGET_GROUP) {
		if (t.group > U8_MAX) {
			bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
			return NULL;
		}
		disk_label = t.group + 1; /* 0 == no label */
	}

	h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
	if (IS_ERR_OR_NULL(h))
		return h;

	if (!h->s) {
		ret = ec_new_stripe_alloc(c, h);
		if (ret) {
			bch_err(c, "failed to allocate new stripe");
			goto err;
		}
	}

	if (h->s->allocated)
		goto allocated;

	if (h->s->have_existing_stripe)
		goto alloc_existing;

	/* First, try to allocate a full stripe: */
	ret =   new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
		__bch2_ec_stripe_head_reserve(trans, h);
	if (!ret)
		goto allocate_buf;
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    bch2_err_matches(ret, ENOMEM))
		goto err;

	/*
	 * Not enough buckets available for a full stripe: we must reuse an
	 * existing stripe:
	 */
	while (1) {
		ret = __bch2_ec_stripe_head_reuse(trans, h);
		if (!ret)
			break;
		if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
			goto err;

		if (watermark == BCH_WATERMARK_copygc) {
			ret =   new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
				__bch2_ec_stripe_head_reserve(trans, h);
			if (ret)
				goto err;
			goto allocate_buf;
		}

		/* XXX freelist_wait? */
		closure_wait(&c->freelist_wait, cl);
		waiting = true;
	}

	if (waiting)
		closure_wake_up(&c->freelist_wait);
alloc_existing:
	/*
	 * Retry allocating buckets, with the watermark for this
	 * particular write:
	 */
	ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
	if (ret)
		goto err;

allocate_buf:
	ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
	if (ret)
		goto err;

	h->s->allocated = true;
allocated:
	BUG_ON(!h->s->idx);
	BUG_ON(!h->s->new_stripe.data[0]);
	BUG_ON(trans->restarted);
	return h;
err:
	bch2_ec_stripe_head_put(c, h);
	return ERR_PTR(ret);
}

/* device removal */

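/*
 * @k_a is an alloc key for a bucket on the device being removed: clear the
 * stripe's pointer to that device and update the replicas accounting to
 * match the new degraded stripe:
 */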
static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
{
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);

	if (!a->stripe)
		return 0;

	if (a->stripe_sectors) {
		bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
		return -BCH_ERR_invalidate_stripe_to_dev;
	}

	struct btree_iter iter;
	struct bkey_i_stripe *s =
		bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
					BTREE_ITER_slots, stripe);
	int ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	struct disk_accounting_pos acc = {
		.type = BCH_DISK_ACCOUNTING_replicas,
	};

	s64 sectors = 0;
	for (unsigned i = 0; i < s->v.nr_blocks; i++)
		sectors -= stripe_blockcount_get(&s->v, i);

	bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
	acc.replicas.data_type = BCH_DATA_user;
	ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
	if (ret)
		goto err;

	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
	bkey_for_each_ptr(ptrs, ptr)
		if (ptr->dev == k_a.k->p.inode)
			ptr->dev = BCH_SB_MEMBER_INVALID;

	sectors = -sectors;

	bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
	acc.replicas.data_type = BCH_DATA_user;
	ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
	if (ret)
		goto err;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
{
	return bch2_trans_run(c,
		for_each_btree_key_upto_commit(trans, iter,
				  BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
				  BTREE_ITER_intent, k,
				  NULL, NULL, 0, ({
			bch2_invalidate_stripe_to_dev(trans, k);
	})));
}

/* startup/shutdown */

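/*
 * Cancel in-flight stripes with blocks on @ca, or all in-flight stripes if
 * @ca is NULL:
 */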
static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
{
	struct ec_stripe_head *h;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->ec_stripe_head_lock);
	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
		mutex_lock(&h->lock);
		if (!h->s)
			goto unlock;

		if (!ca)
			goto found;

		for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
			if (!h->s->blocks[i])
				continue;

			ob = c->open_buckets + h->s->blocks[i];
			if (ob->dev == ca->dev_idx)
				goto found;
		}
		goto unlock;
found:
		ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
unlock:
		mutex_unlock(&h->lock);
	}
	mutex_unlock(&c->ec_stripe_head_lock);
}

void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
	__bch2_ec_stop(c, ca);
}

void bch2_fs_ec_stop(struct bch_fs *c)
{
	__bch2_ec_stop(c, NULL);
}

static bool bch2_fs_ec_flush_done(struct bch_fs *c)
{
	bool ret;

	mutex_lock(&c->ec_stripe_new_lock);
	ret = list_empty(&c->ec_stripe_new_list);
	mutex_unlock(&c->ec_stripe_new_lock);

	return ret;
}

void bch2_fs_ec_flush(struct bch_fs *c)
{
	wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
}

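/*
 * Walk the stripes btree at startup, populating the in-memory stripes
 * radix tree and heap:
 */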
int bch2_stripes_read(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
				   BTREE_ITER_prefetch, k, ({
			if (k.k->type != KEY_TYPE_stripe)
				continue;

			ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
			if (ret)
				break;

			struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);

			stripe_to_mem(m, bkey_s_c_to_stripe(k).v);

			bch2_stripes_heap_insert(c, m, k.k->p.offset);
			0;
		})));
	bch_err_fn(c, ret);
	return ret;
}

void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m;
	size_t i;

	mutex_lock(&c->ec_stripes_heap_lock);
	for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
		m = genradix_ptr(&c->stripes, h->data[i].idx);

		prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
		       h->data[i].blocks_nonempty,
		       m->nr_blocks - m->nr_redundant,
		       m->nr_redundant);
		if (bch2_stripe_is_open(c, h->data[i].idx))
			prt_str(out, " open");
		prt_newline(out);
	}
	mutex_unlock(&c->ec_stripes_heap_lock);
}

static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
				    struct ec_stripe_new *s)
{
	prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
		   s->idx, s->nr_data, s->nr_parity,
		   bitmap_weight(s->blocks_allocated, s->nr_data),
		   atomic_read(&s->ref[STRIPE_REF_io]),
		   atomic_read(&s->ref[STRIPE_REF_stripe]),
		   bch2_watermarks[s->h->watermark]);

	struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
	unsigned i;
	for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
		prt_printf(out, " %u", s->blocks[i]);
	prt_newline(out);
	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
	prt_newline(out);
}

void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct ec_stripe_head *h;
	struct ec_stripe_new *s;

	mutex_lock(&c->ec_stripe_head_lock);
	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
		prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
		       h->disk_label, h->algo, h->redundancy,
		       bch2_watermarks[h->watermark],
		       h->nr_created);

		if (h->s)
			bch2_new_stripe_to_text(out, c, h->s);
	}
	mutex_unlock(&c->ec_stripe_head_lock);

	prt_printf(out, "in flight:\n");

	mutex_lock(&c->ec_stripe_new_lock);
	list_for_each_entry(s, &c->ec_stripe_new_list, list)
		bch2_new_stripe_to_text(out, c, s);
	mutex_unlock(&c->ec_stripe_new_lock);
}

void bch2_fs_ec_exit(struct bch_fs *c)
{
	struct ec_stripe_head *h;
	unsigned i;

	while (1) {
		mutex_lock(&c->ec_stripe_head_lock);
		h = list_first_entry_or_null(&c->ec_stripe_head_list,
					     struct ec_stripe_head, list);
		if (h)
			list_del(&h->list);
		mutex_unlock(&c->ec_stripe_head_lock);
		if (!h)
			break;

		if (h->s) {
			for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
				BUG_ON(h->s->blocks[i]);

			kfree(h->s);
		}
		kfree(h);
	}

	BUG_ON(!list_empty(&c->ec_stripe_new_list));

	free_heap(&c->ec_stripes_heap);
	genradix_free(&c->stripes);
	bioset_exit(&c->ec_bioset);
}

void bch2_fs_ec_init_early(struct bch_fs *c)
{
	spin_lock_init(&c->ec_stripes_new_lock);
	mutex_init(&c->ec_stripes_heap_lock);

	INIT_LIST_HEAD(&c->ec_stripe_head_list);
	mutex_init(&c->ec_stripe_head_lock);

	INIT_LIST_HEAD(&c->ec_stripe_new_list);
	mutex_init(&c->ec_stripe_new_lock);
	init_waitqueue_head(&c->ec_stripe_new_wait);

	INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
}

int bch2_fs_ec_init(struct bch_fs *c)
{
	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
			   BIOSET_NEED_BVECS);
}