1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (C) 2012 Fusion-io  All rights reserved.
4   * Copyright (C) 2012 Intel Corp. All rights reserved.
5   */
6  
7  #include <linux/sched.h>
8  #include <linux/bio.h>
9  #include <linux/slab.h>
10  #include <linux/blkdev.h>
11  #include <linux/raid/pq.h>
12  #include <linux/hash.h>
13  #include <linux/list_sort.h>
14  #include <linux/raid/xor.h>
15  #include <linux/mm.h>
16  #include "messages.h"
17  #include "ctree.h"
18  #include "disk-io.h"
19  #include "volumes.h"
20  #include "raid56.h"
21  #include "async-thread.h"
22  #include "file-item.h"
23  #include "btrfs_inode.h"
24  
25  /* set when additional merges to this rbio are not allowed */
26  #define RBIO_RMW_LOCKED_BIT	1
27  
28  /*
29   * set when this rbio is sitting in the hash, but it is just a cache
30   * of past RMW
31   */
32  #define RBIO_CACHE_BIT		2
33  
34  /*
35   * set when it is safe to trust the stripe_pages for caching
36   */
37  #define RBIO_CACHE_READY_BIT	3
38  
39  #define RBIO_CACHE_SIZE 1024
40  
41  #define BTRFS_STRIPE_HASH_TABLE_BITS				11
42  
static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
44  {
45  	if (unlikely(!bioc)) {
46  		btrfs_crit(fs_info, "bioc=NULL");
47  		return;
48  	}
49  	btrfs_crit(fs_info,
50  "bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
51  		bioc->logical, bioc->full_stripe_logical, bioc->size,
52  		bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
53  		bioc->replace_stripe_src, bioc->num_stripes);
54  	for (int i = 0; i < bioc->num_stripes; i++) {
55  		btrfs_crit(fs_info, "    nr=%d devid=%llu physical=%llu",
56  			   i, bioc->stripes[i].dev->devid,
57  			   bioc->stripes[i].physical);
58  	}
59  }
60  
static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
			    const struct btrfs_raid_bio *rbio)
63  {
64  	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
65  		return;
66  
67  	dump_bioc(fs_info, rbio->bioc);
68  	btrfs_crit(fs_info,
69  "rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx",
70  		rbio->flags, rbio->nr_sectors, rbio->nr_data,
71  		rbio->real_stripes, rbio->stripe_nsectors,
72  		rbio->scrubp, rbio->dbitmap);
73  }
74  
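/*
 * Assertion helpers that dump the rbio (and the offending stripe, sector or
 * logical address where applicable) before the plain ASSERT() triggers.
 * The dump only happens when CONFIG_BTRFS_ASSERT is enabled.
 */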
75  #define ASSERT_RBIO(expr, rbio)						\
76  ({									\
77  	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
78  		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
79  					(rbio)->bioc->fs_info : NULL;	\
80  									\
81  		btrfs_dump_rbio(__fs_info, (rbio));			\
82  	}								\
83  	ASSERT((expr));							\
84  })
85  
86  #define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr)			\
87  ({									\
88  	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
89  		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
90  					(rbio)->bioc->fs_info : NULL;	\
91  									\
92  		btrfs_dump_rbio(__fs_info, (rbio));			\
93  		btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr));	\
94  	}								\
95  	ASSERT((expr));							\
96  })
97  
98  #define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr)			\
99  ({									\
100  	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
101  		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
102  					(rbio)->bioc->fs_info : NULL;	\
103  									\
104  		btrfs_dump_rbio(__fs_info, (rbio));			\
105  		btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr));	\
106  	}								\
107  	ASSERT((expr));							\
108  })
109  
110  #define ASSERT_RBIO_LOGICAL(expr, rbio, logical)			\
111  ({									\
112  	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
113  		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
114  					(rbio)->bioc->fs_info : NULL;	\
115  									\
116  		btrfs_dump_rbio(__fs_info, (rbio));			\
117  		btrfs_crit(__fs_info, "logical=%llu", (logical));		\
118  	}								\
119  	ASSERT((expr));							\
120  })
121  
122  /* Used by the raid56 code to lock stripes for read/modify/write */
123  struct btrfs_stripe_hash {
124  	struct list_head hash_list;
125  	spinlock_t lock;
126  };
127  
128  /* Used by the raid56 code to lock stripes for read/modify/write */
129  struct btrfs_stripe_hash_table {
130  	struct list_head stripe_cache;
131  	spinlock_t cache_lock;
132  	int cache_size;
133  	struct btrfs_stripe_hash table[];
134  };
135  
136  /*
 * A bvec-like structure to present a sector inside a page.
138   *
139   * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
140   */
141  struct sector_ptr {
142  	struct page *page;
143  	unsigned int pgoff:24;
144  	unsigned int uptodate:8;
145  };
146  
147  static void rmw_rbio_work(struct work_struct *work);
148  static void rmw_rbio_work_locked(struct work_struct *work);
149  static void index_rbio_pages(struct btrfs_raid_bio *rbio);
150  static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
151  
152  static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
153  static void scrub_rbio_work_locked(struct work_struct *work);
154  
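/*
 * Free the arrays hanging off an rbio (error bitmap and the page/sector
 * pointer arrays), but not the rbio itself.
 */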
static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
156  {
157  	bitmap_free(rbio->error_bitmap);
158  	kfree(rbio->stripe_pages);
159  	kfree(rbio->bio_sectors);
160  	kfree(rbio->stripe_sectors);
161  	kfree(rbio->finish_pointers);
162  }
163  
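/*
 * Drop a reference to the rbio.  On the final put, release the stripe pages,
 * the bioc reference and the rbio itself.
 */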
static void free_raid_bio(struct btrfs_raid_bio *rbio)
165  {
166  	int i;
167  
168  	if (!refcount_dec_and_test(&rbio->refs))
169  		return;
170  
171  	WARN_ON(!list_empty(&rbio->stripe_cache));
172  	WARN_ON(!list_empty(&rbio->hash_list));
173  	WARN_ON(!bio_list_empty(&rbio->bio_list));
174  
175  	for (i = 0; i < rbio->nr_pages; i++) {
176  		if (rbio->stripe_pages[i]) {
177  			__free_page(rbio->stripe_pages[i]);
178  			rbio->stripe_pages[i] = NULL;
179  		}
180  	}
181  
182  	btrfs_put_bioc(rbio->bioc);
183  	free_raid_bio_pointers(rbio);
184  	kfree(rbio);
185  }
186  
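/* Queue @work_func for this rbio on the filesystem's rmw_workers workqueue. */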
static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
188  {
189  	INIT_WORK(&rbio->work, work_func);
190  	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
191  }
192  
193  /*
194   * the stripe hash table is used for locking, and to collect
195   * bios in hopes of making a full stripe
196   */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
198  {
199  	struct btrfs_stripe_hash_table *table;
200  	struct btrfs_stripe_hash_table *x;
201  	struct btrfs_stripe_hash *cur;
202  	struct btrfs_stripe_hash *h;
203  	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
204  	int i;
205  
206  	if (info->stripe_hash_table)
207  		return 0;
208  
209  	/*
210  	 * The table is large, starting with order 4 and can go as high as
211  	 * order 7 in case lock debugging is turned on.
212  	 *
213  	 * Try harder to allocate and fallback to vmalloc to lower the chance
214  	 * of a failing mount.
215  	 */
216  	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
217  	if (!table)
218  		return -ENOMEM;
219  
220  	spin_lock_init(&table->cache_lock);
221  	INIT_LIST_HEAD(&table->stripe_cache);
222  
223  	h = table->table;
224  
225  	for (i = 0; i < num_entries; i++) {
226  		cur = h + i;
227  		INIT_LIST_HEAD(&cur->hash_list);
228  		spin_lock_init(&cur->lock);
229  	}
230  
231  	x = cmpxchg(&info->stripe_hash_table, NULL, table);
232  	kvfree(x);
233  	return 0;
234  }
235  
236  /*
 * Caching an rbio means copying everything from the
 * bio_sectors array into the stripe_pages array.  We
239   * use the page uptodate bit in the stripe cache array
240   * to indicate if it has valid data
241   *
242   * once the caching is done, we set the cache ready
243   * bit.
244   */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
246  {
247  	int i;
248  	int ret;
249  
250  	ret = alloc_rbio_pages(rbio);
251  	if (ret)
252  		return;
253  
254  	for (i = 0; i < rbio->nr_sectors; i++) {
255  		/* Some range not covered by bio (partial write), skip it */
256  		if (!rbio->bio_sectors[i].page) {
257  			/*
258  			 * Even if the sector is not covered by bio, if it is
259  			 * a data sector it should still be uptodate as it is
260  			 * read from disk.
261  			 */
262  			if (i < rbio->nr_data * rbio->stripe_nsectors)
263  				ASSERT(rbio->stripe_sectors[i].uptodate);
264  			continue;
265  		}
266  
267  		ASSERT(rbio->stripe_sectors[i].page);
268  		memcpy_page(rbio->stripe_sectors[i].page,
269  			    rbio->stripe_sectors[i].pgoff,
270  			    rbio->bio_sectors[i].page,
271  			    rbio->bio_sectors[i].pgoff,
272  			    rbio->bioc->fs_info->sectorsize);
273  		rbio->stripe_sectors[i].uptodate = 1;
274  	}
275  	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
276  }
277  
278  /*
279   * we hash on the first logical address of the stripe
280   */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
282  {
283  	u64 num = rbio->bioc->full_stripe_logical;
284  
285  	/*
286  	 * we shift down quite a bit.  We're using byte
287  	 * addressing, and most of the lower bits are zeros.
288  	 * This tends to upset hash_64, and it consistently
289  	 * returns just one or two different values.
290  	 *
291  	 * shifting off the lower bits fixes things.
292  	 */
293  	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
294  }
295  
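/* Return true only if every sector inside stripe page @page_nr is uptodate. */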
static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
298  {
299  	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
300  	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
301  	int i;
302  
303  	ASSERT(page_nr < rbio->nr_pages);
304  
305  	for (i = sectors_per_page * page_nr;
306  	     i < sectors_per_page * page_nr + sectors_per_page;
307  	     i++) {
308  		if (!rbio->stripe_sectors[i].uptodate)
309  			return false;
310  	}
311  	return true;
312  }
313  
314  /*
315   * Update the stripe_sectors[] array to use correct page and pgoff
316   *
 * Should be called every time any page pointer in stripe_pages[] is modified.
318   */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
320  {
321  	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
322  	u32 offset;
323  	int i;
324  
325  	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
326  		int page_index = offset >> PAGE_SHIFT;
327  
328  		ASSERT(page_index < rbio->nr_pages);
329  		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
330  		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
331  	}
332  }
333  
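/*
 * Move one stripe page from @src to @dest, freeing any page @dest already had
 * at that slot, and mark all sectors of that page uptodate in @dest.
 */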
static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
336  {
337  	const u32 sectorsize = src->bioc->fs_info->sectorsize;
338  	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
339  	int i;
340  
341  	if (dest->stripe_pages[page_nr])
342  		__free_page(dest->stripe_pages[page_nr]);
343  	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
344  	src->stripe_pages[page_nr] = NULL;
345  
346  	/* Also update the sector->uptodate bits. */
347  	for (i = sectors_per_page * page_nr;
348  	     i < sectors_per_page * page_nr + sectors_per_page; i++)
349  		dest->stripe_sectors[i].uptodate = true;
350  }
351  
static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
353  {
354  	const int sector_nr = (page_nr << PAGE_SHIFT) >>
355  			      rbio->bioc->fs_info->sectorsize_bits;
356  
357  	/*
358  	 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
359  	 * we won't have a page which is half data half parity.
360  	 *
361  	 * Thus if the first sector of the page belongs to data stripes, then
362  	 * the full page belongs to data stripes.
363  	 */
364  	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
365  }
366  
367  /*
368   * Stealing an rbio means taking all the uptodate pages from the stripe array
369   * in the source rbio and putting them into the destination rbio.
370   *
371   * This will also update the involved stripe_sectors[] which are referring to
372   * the old pages.
373   */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
375  {
376  	int i;
377  
378  	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
379  		return;
380  
381  	for (i = 0; i < dest->nr_pages; i++) {
382  		struct page *p = src->stripe_pages[i];
383  
384  		/*
385  		 * We don't need to steal P/Q pages as they will always be
386  		 * regenerated for RMW or full write anyway.
387  		 */
388  		if (!is_data_stripe_page(src, i))
389  			continue;
390  
391  		/*
392  		 * If @src already has RBIO_CACHE_READY_BIT, it should have
393  		 * all data stripe pages present and uptodate.
394  		 */
395  		ASSERT(p);
396  		ASSERT(full_page_sectors_uptodate(src, i));
397  		steal_rbio_page(src, dest, i);
398  	}
399  	index_stripe_sectors(dest);
400  	index_stripe_sectors(src);
401  }
402  
403  /*
404   * merging means we take the bio_list from the victim and
405   * splice it into the destination.  The victim should
406   * be discarded afterwards.
407   *
408   * must be called with dest->rbio_list_lock held
409   */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
412  {
413  	bio_list_merge_init(&dest->bio_list, &victim->bio_list);
414  	dest->bio_list_bytes += victim->bio_list_bytes;
415  	/* Also inherit the bitmaps from @victim. */
416  	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
417  		  dest->stripe_nsectors);
418  }
419  
420  /*
421   * used to prune items that are in the cache.  The caller
422   * must hold the hash table lock.
423   */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
425  {
426  	int bucket = rbio_bucket(rbio);
427  	struct btrfs_stripe_hash_table *table;
428  	struct btrfs_stripe_hash *h;
429  	int freeit = 0;
430  
431  	/*
432  	 * check the bit again under the hash table lock.
433  	 */
434  	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
435  		return;
436  
437  	table = rbio->bioc->fs_info->stripe_hash_table;
438  	h = table->table + bucket;
439  
440  	/* hold the lock for the bucket because we may be
441  	 * removing it from the hash table
442  	 */
443  	spin_lock(&h->lock);
444  
445  	/*
446  	 * hold the lock for the bio list because we need
447  	 * to make sure the bio list is empty
448  	 */
449  	spin_lock(&rbio->bio_list_lock);
450  
451  	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
452  		list_del_init(&rbio->stripe_cache);
453  		table->cache_size -= 1;
454  		freeit = 1;
455  
456  		/* if the bio list isn't empty, this rbio is
457  		 * still involved in an IO.  We take it out
458  		 * of the cache list, and drop the ref that
459  		 * was held for the list.
460  		 *
461  		 * If the bio_list was empty, we also remove
462  		 * the rbio from the hash_table, and drop
463  		 * the corresponding ref
464  		 */
465  		if (bio_list_empty(&rbio->bio_list)) {
466  			if (!list_empty(&rbio->hash_list)) {
467  				list_del_init(&rbio->hash_list);
468  				refcount_dec(&rbio->refs);
469  				BUG_ON(!list_empty(&rbio->plug_list));
470  			}
471  		}
472  	}
473  
474  	spin_unlock(&rbio->bio_list_lock);
475  	spin_unlock(&h->lock);
476  
477  	if (freeit)
478  		free_raid_bio(rbio);
479  }
480  
481  /*
482   * prune a given rbio from the cache
483   */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
485  {
486  	struct btrfs_stripe_hash_table *table;
487  
488  	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
489  		return;
490  
491  	table = rbio->bioc->fs_info->stripe_hash_table;
492  
493  	spin_lock(&table->cache_lock);
494  	__remove_rbio_from_cache(rbio);
495  	spin_unlock(&table->cache_lock);
496  }
497  
498  /*
499   * remove everything in the cache
500   */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
502  {
503  	struct btrfs_stripe_hash_table *table;
504  	struct btrfs_raid_bio *rbio;
505  
506  	table = info->stripe_hash_table;
507  
508  	spin_lock(&table->cache_lock);
509  	while (!list_empty(&table->stripe_cache)) {
510  		rbio = list_entry(table->stripe_cache.next,
511  				  struct btrfs_raid_bio,
512  				  stripe_cache);
513  		__remove_rbio_from_cache(rbio);
514  	}
515  	spin_unlock(&table->cache_lock);
516  }
517  
518  /*
519   * remove all cached entries and free the hash table
520   * used by unmount
521   */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
523  {
524  	if (!info->stripe_hash_table)
525  		return;
526  	btrfs_clear_rbio_cache(info);
527  	kvfree(info->stripe_hash_table);
528  	info->stripe_hash_table = NULL;
529  }
530  
531  /*
532   * insert an rbio into the stripe cache.  It
533   * must have already been prepared by calling
534   * cache_rbio_pages
535   *
536   * If this rbio was already cached, it gets
537   * moved to the front of the lru.
538   *
539   * If the size of the rbio cache is too big, we
540   * prune an item.
541   */
static void cache_rbio(struct btrfs_raid_bio *rbio)
543  {
544  	struct btrfs_stripe_hash_table *table;
545  
546  	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
547  		return;
548  
549  	table = rbio->bioc->fs_info->stripe_hash_table;
550  
551  	spin_lock(&table->cache_lock);
552  	spin_lock(&rbio->bio_list_lock);
553  
554  	/* bump our ref if we were not in the list before */
555  	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
556  		refcount_inc(&rbio->refs);
557  
	if (!list_empty(&rbio->stripe_cache)) {
559  		list_move(&rbio->stripe_cache, &table->stripe_cache);
560  	} else {
561  		list_add(&rbio->stripe_cache, &table->stripe_cache);
562  		table->cache_size += 1;
563  	}
564  
565  	spin_unlock(&rbio->bio_list_lock);
566  
567  	if (table->cache_size > RBIO_CACHE_SIZE) {
568  		struct btrfs_raid_bio *found;
569  
570  		found = list_entry(table->stripe_cache.prev,
571  				  struct btrfs_raid_bio,
572  				  stripe_cache);
573  
574  		if (found != rbio)
575  			__remove_rbio_from_cache(found);
576  	}
577  
578  	spin_unlock(&table->cache_lock);
579  }
580  
581  /*
582   * helper function to run the xor_blocks api.  It is only
583   * able to do MAX_XOR_BLOCKS at a time, so we need to
584   * loop through.
585   */
static void run_xor(void **pages, int src_cnt, ssize_t len)
587  {
588  	int src_off = 0;
589  	int xor_src_cnt = 0;
590  	void *dest = pages[src_cnt];
591  
	while (src_cnt > 0) {
593  		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
594  		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
595  
596  		src_cnt -= xor_src_cnt;
597  		src_off += xor_src_cnt;
598  	}
599  }
600  
601  /*
602   * Returns true if the bio list inside this rbio covers an entire stripe (no
603   * rmw required).
604   */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
606  {
607  	unsigned long size = rbio->bio_list_bytes;
608  	int ret = 1;
609  
610  	spin_lock(&rbio->bio_list_lock);
611  	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
612  		ret = 0;
613  	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
614  	spin_unlock(&rbio->bio_list_lock);
615  
616  	return ret;
617  }
618  
619  /*
620   * returns 1 if it is safe to merge two rbios together.
621   * The merging is safe if the two rbios correspond to
622   * the same stripe and if they are both going in the same
623   * direction (read vs write), and if neither one is
624   * locked for final IO
625   *
626   * The caller is responsible for locking such that
627   * rmw_locked is safe to test
628   */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
631  {
632  	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
633  	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
634  		return 0;
635  
636  	/*
637  	 * we can't merge with cached rbios, since the
638  	 * idea is that when we merge the destination
639  	 * rbio is going to run our IO for us.  We can
640  	 * steal from cached rbios though, other functions
641  	 * handle that.
642  	 */
643  	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
644  	    test_bit(RBIO_CACHE_BIT, &cur->flags))
645  		return 0;
646  
647  	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
648  		return 0;
649  
650  	/* we can't merge with different operations */
651  	if (last->operation != cur->operation)
652  		return 0;
653  	/*
	 * We need to read the full stripe from the drive, check and
	 * repair the parity, and write the new results.
656  	 *
657  	 * We're not allowed to add any new bios to the
658  	 * bio list here, anyone else that wants to
659  	 * change this stripe needs to do their own rmw.
660  	 */
661  	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
662  		return 0;
663  
664  	if (last->operation == BTRFS_RBIO_READ_REBUILD)
665  		return 0;
666  
667  	return 1;
668  }
669  
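/* Convert a (stripe_nr, sector_nr) pair into a flat index into stripe_sectors[]. */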
static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
673  {
674  	ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
675  	ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
676  
677  	return stripe_nr * rbio->stripe_nsectors + sector_nr;
678  }
679  
680  /* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
684  {
685  	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
686  							      sector_nr)];
687  }
688  
689  /* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
692  {
693  	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
694  }
695  
696  /* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
699  {
700  	if (rbio->nr_data + 1 == rbio->real_stripes)
701  		return NULL;
702  	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
703  }
704  
705  /*
706   * The first stripe in the table for a logical address
707   * has the lock.  rbios are added in one of three ways:
708   *
709   * 1) Nobody has the stripe locked yet.  The rbio is given
710   * the lock and 0 is returned.  The caller must start the IO
711   * themselves.
712   *
713   * 2) Someone has the stripe locked, but we're able to merge
714   * with the lock owner.  The rbio is freed and the IO will
715   * start automatically along with the existing rbio.  1 is returned.
716   *
717   * 3) Someone has the stripe locked, but we're not able to merge.
718   * The rbio is added to the lock owner's plug list, or merged into
719   * an rbio already on the plug list.  When the lock owner unlocks,
720   * the next rbio on the list is run and the IO is started automatically.
721   * 1 is returned
722   *
723   * If we return 0, the caller still owns the rbio and must continue with
724   * IO submission.  If we return 1, the caller must assume the rbio has
725   * already been freed.
726   */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
728  {
729  	struct btrfs_stripe_hash *h;
730  	struct btrfs_raid_bio *cur;
731  	struct btrfs_raid_bio *pending;
732  	struct btrfs_raid_bio *freeit = NULL;
733  	struct btrfs_raid_bio *cache_drop = NULL;
734  	int ret = 0;
735  
736  	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
737  
738  	spin_lock(&h->lock);
739  	list_for_each_entry(cur, &h->hash_list, hash_list) {
740  		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
741  			continue;
742  
743  		spin_lock(&cur->bio_list_lock);
744  
745  		/* Can we steal this cached rbio's pages? */
746  		if (bio_list_empty(&cur->bio_list) &&
747  		    list_empty(&cur->plug_list) &&
748  		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
749  		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
750  			list_del_init(&cur->hash_list);
751  			refcount_dec(&cur->refs);
752  
753  			steal_rbio(cur, rbio);
754  			cache_drop = cur;
755  			spin_unlock(&cur->bio_list_lock);
756  
757  			goto lockit;
758  		}
759  
760  		/* Can we merge into the lock owner? */
761  		if (rbio_can_merge(cur, rbio)) {
762  			merge_rbio(cur, rbio);
763  			spin_unlock(&cur->bio_list_lock);
764  			freeit = rbio;
765  			ret = 1;
766  			goto out;
767  		}
768  
769  
770  		/*
771  		 * We couldn't merge with the running rbio, see if we can merge
772  		 * with the pending ones.  We don't have to check for rmw_locked
773  		 * because there is no way they are inside finish_rmw right now
774  		 */
775  		list_for_each_entry(pending, &cur->plug_list, plug_list) {
776  			if (rbio_can_merge(pending, rbio)) {
777  				merge_rbio(pending, rbio);
778  				spin_unlock(&cur->bio_list_lock);
779  				freeit = rbio;
780  				ret = 1;
781  				goto out;
782  			}
783  		}
784  
785  		/*
786  		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks.
788  		 */
789  		list_add_tail(&rbio->plug_list, &cur->plug_list);
790  		spin_unlock(&cur->bio_list_lock);
791  		ret = 1;
792  		goto out;
793  	}
794  lockit:
795  	refcount_inc(&rbio->refs);
796  	list_add(&rbio->hash_list, &h->hash_list);
797  out:
798  	spin_unlock(&h->lock);
799  	if (cache_drop)
800  		remove_rbio_from_cache(cache_drop);
801  	if (freeit)
802  		free_raid_bio(freeit);
803  	return ret;
804  }
805  
806  static void recover_rbio_work_locked(struct work_struct *work);
807  
808  /*
809   * called as rmw or parity rebuild is completed.  If the plug list has more
810   * rbios waiting for this stripe, the next one on the list will be started
811   */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
813  {
814  	int bucket;
815  	struct btrfs_stripe_hash *h;
816  	int keep_cache = 0;
817  
818  	bucket = rbio_bucket(rbio);
819  	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
820  
821  	if (list_empty(&rbio->plug_list))
822  		cache_rbio(rbio);
823  
824  	spin_lock(&h->lock);
825  	spin_lock(&rbio->bio_list_lock);
826  
827  	if (!list_empty(&rbio->hash_list)) {
828  		/*
829  		 * if we're still cached and there is no other IO
830  		 * to perform, just leave this rbio here for others
831  		 * to steal from later
832  		 */
833  		if (list_empty(&rbio->plug_list) &&
834  		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
835  			keep_cache = 1;
836  			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
837  			BUG_ON(!bio_list_empty(&rbio->bio_list));
838  			goto done;
839  		}
840  
841  		list_del_init(&rbio->hash_list);
842  		refcount_dec(&rbio->refs);
843  
844  		/*
845  		 * we use the plug list to hold all the rbios
846  		 * waiting for the chance to lock this stripe.
847  		 * hand the lock over to one of them.
848  		 */
849  		if (!list_empty(&rbio->plug_list)) {
850  			struct btrfs_raid_bio *next;
851  			struct list_head *head = rbio->plug_list.next;
852  
853  			next = list_entry(head, struct btrfs_raid_bio,
854  					  plug_list);
855  
856  			list_del_init(&rbio->plug_list);
857  
858  			list_add(&next->hash_list, &h->hash_list);
859  			refcount_inc(&next->refs);
860  			spin_unlock(&rbio->bio_list_lock);
861  			spin_unlock(&h->lock);
862  
863  			if (next->operation == BTRFS_RBIO_READ_REBUILD) {
864  				start_async_work(next, recover_rbio_work_locked);
865  			} else if (next->operation == BTRFS_RBIO_WRITE) {
866  				steal_rbio(rbio, next);
867  				start_async_work(next, rmw_rbio_work_locked);
868  			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
869  				steal_rbio(rbio, next);
870  				start_async_work(next, scrub_rbio_work_locked);
871  			}
872  
873  			goto done_nolock;
874  		}
875  	}
876  done:
877  	spin_unlock(&rbio->bio_list_lock);
878  	spin_unlock(&h->lock);
879  
880  done_nolock:
881  	if (!keep_cache)
882  		remove_rbio_from_cache(rbio);
883  }
884  
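/* End every bio in the singly linked list starting at @cur with status @err. */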
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
886  {
887  	struct bio *next;
888  
889  	while (cur) {
890  		next = cur->bi_next;
891  		cur->bi_next = NULL;
892  		cur->bi_status = err;
893  		bio_endio(cur);
894  		cur = next;
895  	}
896  }
897  
898  /*
899   * this frees the rbio and runs through all the bios in the
900   * bio_list and calls end_io on them
901   */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
903  {
904  	struct bio *cur = bio_list_get(&rbio->bio_list);
905  	struct bio *extra;
906  
907  	kfree(rbio->csum_buf);
908  	bitmap_free(rbio->csum_bitmap);
909  	rbio->csum_buf = NULL;
910  	rbio->csum_bitmap = NULL;
911  
912  	/*
913  	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio
	 * for this rbio.
916  	 */
917  	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
918  
919  	/*
920  	 * At this moment, rbio->bio_list is empty, however since rbio does not
921  	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
922  	 * hash list, rbio may be merged with others so that rbio->bio_list
923  	 * becomes non-empty.
924  	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
925  	 * more and we can call bio_endio() on all queued bios.
926  	 */
927  	unlock_stripe(rbio);
928  	extra = bio_list_get(&rbio->bio_list);
929  	free_raid_bio(rbio);
930  
931  	rbio_endio_bio_list(cur, err);
932  	if (extra)
933  		rbio_endio_bio_list(extra, err);
934  }
935  
936  /*
937   * Get a sector pointer specified by its @stripe_nr and @sector_nr.
938   *
939   * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripes)
941   * @sector_nr:		Sector number inside the stripe,
942   *			valid range [0, stripe_nsectors)
943   * @bio_list_only:      Whether to use sectors inside the bio list only.
944   *
945   * The read/modify/write code wants to reuse the original bio page as much
946   * as possible, and only use stripe_sectors as fallback.
947   */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
951  {
952  	struct sector_ptr *sector;
953  	int index;
954  
955  	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes,
956  			   rbio, stripe_nr);
957  	ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
958  			   rbio, sector_nr);
959  
960  	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
961  	ASSERT(index >= 0 && index < rbio->nr_sectors);
962  
963  	spin_lock(&rbio->bio_list_lock);
964  	sector = &rbio->bio_sectors[index];
965  	if (sector->page || bio_list_only) {
966  		/* Don't return sector without a valid page pointer */
967  		if (!sector->page)
968  			sector = NULL;
969  		spin_unlock(&rbio->bio_list_lock);
970  		return sector;
971  	}
972  	spin_unlock(&rbio->bio_list_lock);
973  
974  	return &rbio->stripe_sectors[index];
975  }
976  
977  /*
 * Allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->stripe_pages.
980   */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc)
983  {
984  	const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
985  	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
986  	const unsigned int num_pages = stripe_npages * real_stripes;
987  	const unsigned int stripe_nsectors =
988  		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
989  	const unsigned int num_sectors = stripe_nsectors * real_stripes;
990  	struct btrfs_raid_bio *rbio;
991  
992  	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
993  	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
994  	/*
995  	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
996  	 * (at most 16) should be no larger than BITS_PER_LONG.
997  	 */
998  	ASSERT(stripe_nsectors <= BITS_PER_LONG);
999  
1000  	/*
1001  	 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
1002  	 * (limited by u8).
1003  	 */
1004  	ASSERT(real_stripes >= 2);
1005  	ASSERT(real_stripes <= U8_MAX);
1006  
1007  	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
1008  	if (!rbio)
1009  		return ERR_PTR(-ENOMEM);
1010  	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
1011  				     GFP_NOFS);
1012  	rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
1013  				    GFP_NOFS);
1014  	rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
1015  				       GFP_NOFS);
1016  	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
1017  	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
1018  
1019  	if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
1020  	    !rbio->finish_pointers || !rbio->error_bitmap) {
1021  		free_raid_bio_pointers(rbio);
1022  		kfree(rbio);
1023  		return ERR_PTR(-ENOMEM);
1024  	}
1025  
1026  	bio_list_init(&rbio->bio_list);
1027  	init_waitqueue_head(&rbio->io_wait);
1028  	INIT_LIST_HEAD(&rbio->plug_list);
1029  	spin_lock_init(&rbio->bio_list_lock);
1030  	INIT_LIST_HEAD(&rbio->stripe_cache);
1031  	INIT_LIST_HEAD(&rbio->hash_list);
1032  	btrfs_get_bioc(bioc);
1033  	rbio->bioc = bioc;
1034  	rbio->nr_pages = num_pages;
1035  	rbio->nr_sectors = num_sectors;
1036  	rbio->real_stripes = real_stripes;
1037  	rbio->stripe_npages = stripe_npages;
1038  	rbio->stripe_nsectors = stripe_nsectors;
1039  	refcount_set(&rbio->refs, 1);
1040  	atomic_set(&rbio->stripes_pending, 0);
1041  
1042  	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
1043  	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
1044  	ASSERT(rbio->nr_data > 0);
1045  
1046  	return rbio;
1047  }
1048  
1049  /* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1051  {
1052  	int ret;
1053  
1054  	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
1055  	if (ret < 0)
1056  		return ret;
1057  	/* Mapping all sectors */
1058  	index_stripe_sectors(rbio);
1059  	return 0;
1060  }
1061  
1062  /* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1064  {
1065  	const int data_pages = rbio->nr_data * rbio->stripe_npages;
1066  	int ret;
1067  
1068  	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1069  				     rbio->stripe_pages + data_pages, false);
1070  	if (ret < 0)
1071  		return ret;
1072  
1073  	index_stripe_sectors(rbio);
1074  	return 0;
1075  }
1076  
1077  /*
1078   * Return the total number of errors found in the vertical stripe of @sector_nr.
1079   *
1080   * @faila and @failb will also be updated to the first and second stripe
1081   * number of the errors.
1082   */
static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
				     int *faila, int *failb)
1085  {
1086  	int stripe_nr;
1087  	int found_errors = 0;
1088  
1089  	if (faila || failb) {
1090  		/*
1091  		 * Both @faila and @failb should be valid pointers if any of
1092  		 * them is specified.
1093  		 */
1094  		ASSERT(faila && failb);
1095  		*faila = -1;
1096  		*failb = -1;
1097  	}
1098  
1099  	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1100  		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
1101  
1102  		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
1103  			found_errors++;
1104  			if (faila) {
1105  				/* Update faila and failb. */
1106  				if (*faila < 0)
1107  					*faila = stripe_nr;
1108  				else if (*failb < 0)
1109  					*failb = stripe_nr;
1110  			}
1111  		}
1112  	}
1113  	return found_errors;
1114  }
1115  
1116  /*
1117   * Add a single sector @sector into our list of bios for IO.
1118   *
1119   * Return 0 if everything went well.
1120   * Return <0 for error.
1121   */
static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list,
			      struct sector_ptr *sector,
			      unsigned int stripe_nr,
			      unsigned int sector_nr,
			      enum req_op op)
1128  {
1129  	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1130  	struct bio *last = bio_list->tail;
1131  	int ret;
1132  	struct bio *bio;
1133  	struct btrfs_io_stripe *stripe;
1134  	u64 disk_start;
1135  
1136  	/*
1137  	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripes.
1139  	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
1140  	 */
1141  	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
1142  			   rbio, stripe_nr);
1143  	ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
1144  			   rbio, sector_nr);
1145  	ASSERT(sector->page);
1146  
1147  	stripe = &rbio->bioc->stripes[stripe_nr];
1148  	disk_start = stripe->physical + sector_nr * sectorsize;
1149  
1150  	/* if the device is missing, just fail this stripe */
1151  	if (!stripe->dev->bdev) {
1152  		int found_errors;
1153  
1154  		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
1155  			rbio->error_bitmap);
1156  
1157  		/* Check if we have reached tolerance early. */
1158  		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
1159  							 NULL, NULL);
1160  		if (found_errors > rbio->bioc->max_errors)
1161  			return -EIO;
1162  		return 0;
1163  	}
1164  
1165  	/* see if we can add this page onto our existing bio */
1166  	if (last) {
1167  		u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
1168  		last_end += last->bi_iter.bi_size;
1169  
1170  		/*
1171  		 * we can't merge these if they are from different
1172  		 * devices or if they are not contiguous
1173  		 */
1174  		if (last_end == disk_start && !last->bi_status &&
1175  		    last->bi_bdev == stripe->dev->bdev) {
1176  			ret = bio_add_page(last, sector->page, sectorsize,
1177  					   sector->pgoff);
1178  			if (ret == sectorsize)
1179  				return 0;
1180  		}
1181  	}
1182  
1183  	/* put a new bio on the list */
1184  	bio = bio_alloc(stripe->dev->bdev,
1185  			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
1186  			op, GFP_NOFS);
1187  	bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
1188  	bio->bi_private = rbio;
1189  
1190  	__bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
1191  	bio_list_add(bio_list, bio);
1192  	return 0;
1193  }
1194  
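/*
 * Map each sector of @bio into rbio->bio_sectors[], indexed by the sector's
 * offset from the start of the full stripe.
 */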
static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1196  {
1197  	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1198  	struct bio_vec bvec;
1199  	struct bvec_iter iter;
1200  	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1201  		     rbio->bioc->full_stripe_logical;
1202  
1203  	bio_for_each_segment(bvec, bio, iter) {
1204  		u32 bvec_offset;
1205  
1206  		for (bvec_offset = 0; bvec_offset < bvec.bv_len;
1207  		     bvec_offset += sectorsize, offset += sectorsize) {
1208  			int index = offset / sectorsize;
1209  			struct sector_ptr *sector = &rbio->bio_sectors[index];
1210  
1211  			sector->page = bvec.bv_page;
1212  			sector->pgoff = bvec.bv_offset + bvec_offset;
1213  			ASSERT(sector->pgoff < PAGE_SIZE);
1214  		}
1215  	}
1216  }
1217  
1218  /*
1219   * helper function to walk our bio list and populate the bio_pages array with
1220   * the result.  This seems expensive, but it is faster than constantly
1221   * searching through the bio list as we setup the IO in finish_rmw or stripe
1222   * reconstruction.
1223   *
1224   * This must be called before you trust the answers from page_in_rbio
1225   */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1227  {
1228  	struct bio *bio;
1229  
1230  	spin_lock(&rbio->bio_list_lock);
1231  	bio_list_for_each(bio, &rbio->bio_list)
1232  		index_one_bio(rbio, bio);
1233  
1234  	spin_unlock(&rbio->bio_list_lock);
1235  }
1236  
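/*
 * Fill @trace_info (stripe number, devid and physical offset) for @bio by
 * matching its block device against the stripes of the bioc.
 */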
static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
			       struct raid56_bio_trace_info *trace_info)
1239  {
1240  	const struct btrfs_io_context *bioc = rbio->bioc;
1241  	int i;
1242  
1243  	ASSERT(bioc);
1244  
1245  	/* We rely on bio->bi_bdev to find the stripe number. */
1246  	if (!bio->bi_bdev)
1247  		goto not_found;
1248  
1249  	for (i = 0; i < bioc->num_stripes; i++) {
1250  		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1251  			continue;
1252  		trace_info->stripe_nr = i;
1253  		trace_info->devid = bioc->stripes[i].dev->devid;
1254  		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1255  				     bioc->stripes[i].physical;
1256  		return;
1257  	}
1258  
1259  not_found:
1260  	trace_info->devid = -1;
1261  	trace_info->offset = -1;
1262  	trace_info->stripe_nr = -1;
1263  }
1264  
static inline void bio_list_put(struct bio_list *bio_list)
1266  {
1267  	struct bio *bio;
1268  
1269  	while ((bio = bio_list_pop(bio_list)))
1270  		bio_put(bio);
1271  }
1272  
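/* Basic sanity checks on the rbio geometry, only active for debug builds. */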
static void assert_rbio(struct btrfs_raid_bio *rbio)
1274  {
1275  	if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
1276  	    !IS_ENABLED(CONFIG_BTRFS_ASSERT))
1277  		return;
1278  
1279  	/*
1280  	 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
1281  	 * we won't go beyond 256 disks anyway.
1282  	 */
1283  	ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
1284  	ASSERT_RBIO(rbio->nr_data > 0, rbio);
1285  
1286  	/*
1287  	 * This is another check to make sure nr data stripes is smaller
1288  	 * than total stripes.
1289  	 */
1290  	ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
1291  }
1292  
1293  /* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
1295  {
1296  	void **pointers = rbio->finish_pointers;
1297  	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1298  	struct sector_ptr *sector;
1299  	int stripe;
1300  	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
1301  
1302  	/* First collect one sector from each data stripe */
1303  	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1304  		sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1305  		pointers[stripe] = kmap_local_page(sector->page) +
1306  				   sector->pgoff;
1307  	}
1308  
1309  	/* Then add the parity stripe */
1310  	sector = rbio_pstripe_sector(rbio, sectornr);
1311  	sector->uptodate = 1;
1312  	pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
1313  
1314  	if (has_qstripe) {
1315  		/*
1316  		 * RAID6, add the qstripe and call the library function
1317  		 * to fill in our p/q
1318  		 */
1319  		sector = rbio_qstripe_sector(rbio, sectornr);
1320  		sector->uptodate = 1;
1321  		pointers[stripe++] = kmap_local_page(sector->page) +
1322  				     sector->pgoff;
1323  
1324  		assert_rbio(rbio);
1325  		raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
1326  					pointers);
1327  	} else {
1328  		/* raid5 */
1329  		memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
1330  		run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
1331  	}
1332  	for (stripe = stripe - 1; stripe >= 0; stripe--)
1333  		kunmap_local(pointers[stripe]);
1334  }
1335  
static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
				   struct bio_list *bio_list)
1338  {
1339  	/* The total sector number inside the full stripe. */
1340  	int total_sector_nr;
1341  	int sectornr;
1342  	int stripe;
1343  	int ret;
1344  
1345  	ASSERT(bio_list_size(bio_list) == 0);
1346  
1347  	/* We should have at least one data sector. */
1348  	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
1349  
1350  	/*
	 * Reset errors, as we may have errors inherited from a degraded
	 * write.
1353  	 */
1354  	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
1355  
1356  	/*
1357  	 * Start assembly.  Make bios for everything from the higher layers (the
1358  	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
1359  	 */
1360  	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1361  	     total_sector_nr++) {
1362  		struct sector_ptr *sector;
1363  
1364  		stripe = total_sector_nr / rbio->stripe_nsectors;
1365  		sectornr = total_sector_nr % rbio->stripe_nsectors;
1366  
1367  		/* This vertical stripe has no data, skip it. */
1368  		if (!test_bit(sectornr, &rbio->dbitmap))
1369  			continue;
1370  
1371  		if (stripe < rbio->nr_data) {
1372  			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1373  			if (!sector)
1374  				continue;
1375  		} else {
1376  			sector = rbio_stripe_sector(rbio, stripe, sectornr);
1377  		}
1378  
1379  		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
1380  					 sectornr, REQ_OP_WRITE);
1381  		if (ret)
1382  			goto error;
1383  	}
1384  
1385  	if (likely(!rbio->bioc->replace_nr_stripes))
1386  		return 0;
1387  
1388  	/*
1389  	 * Make a copy for the replace target device.
1390  	 *
1391  	 * Thus the source stripe number (in replace_stripe_src) should be valid.
1392  	 */
1393  	ASSERT(rbio->bioc->replace_stripe_src >= 0);
1394  
1395  	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1396  	     total_sector_nr++) {
1397  		struct sector_ptr *sector;
1398  
1399  		stripe = total_sector_nr / rbio->stripe_nsectors;
1400  		sectornr = total_sector_nr % rbio->stripe_nsectors;
1401  
1402  		/*
1403  		 * For RAID56, there is only one device that can be replaced,
1404  		 * and replace_stripe_src[0] indicates the stripe number we
1405  		 * need to copy from.
1406  		 */
1407  		if (stripe != rbio->bioc->replace_stripe_src) {
1408  			/*
1409  			 * We can skip the whole stripe completely, note
1410  			 * total_sector_nr will be increased by one anyway.
1411  			 */
1412  			ASSERT(sectornr == 0);
1413  			total_sector_nr += rbio->stripe_nsectors - 1;
1414  			continue;
1415  		}
1416  
1417  		/* This vertical stripe has no data, skip it. */
1418  		if (!test_bit(sectornr, &rbio->dbitmap))
1419  			continue;
1420  
1421  		if (stripe < rbio->nr_data) {
1422  			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1423  			if (!sector)
1424  				continue;
1425  		} else {
1426  			sector = rbio_stripe_sector(rbio, stripe, sectornr);
1427  		}
1428  
1429  		ret = rbio_add_io_sector(rbio, bio_list, sector,
1430  					 rbio->real_stripes,
1431  					 sectornr, REQ_OP_WRITE);
1432  		if (ret)
1433  			goto error;
1434  	}
1435  
1436  	return 0;
1437  error:
1438  	bio_list_put(bio_list);
1439  	return -EIO;
1440  }
1441  
static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
1443  {
1444  	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1445  	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1446  		     rbio->bioc->full_stripe_logical;
1447  	int total_nr_sector = offset >> fs_info->sectorsize_bits;
1448  
1449  	ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
1450  
1451  	bitmap_set(rbio->error_bitmap, total_nr_sector,
1452  		   bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
1453  
1454  	/*
1455  	 * Special handling for raid56_alloc_missing_rbio() used by
1456  	 * scrub/replace.  Unlike call path in raid56_parity_recover(), they
1457  	 * pass an empty bio here.  Thus we have to find out the missing device
1458  	 * and mark the stripe error instead.
1459  	 */
1460  	if (bio->bi_iter.bi_size == 0) {
1461  		bool found_missing = false;
1462  		int stripe_nr;
1463  
1464  		for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1465  			if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
1466  				found_missing = true;
1467  				bitmap_set(rbio->error_bitmap,
1468  					   stripe_nr * rbio->stripe_nsectors,
1469  					   rbio->stripe_nsectors);
1470  			}
1471  		}
1472  		ASSERT(found_missing);
1473  	}
1474  }
1475  
1476  /*
1477   * For subpage case, we can no longer set page Up-to-date directly for
1478   * stripe_pages[], thus we need to locate the sector.
1479   */
static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
					     struct page *page,
					     unsigned int pgoff)
1483  {
1484  	int i;
1485  
1486  	for (i = 0; i < rbio->nr_sectors; i++) {
1487  		struct sector_ptr *sector = &rbio->stripe_sectors[i];
1488  
1489  		if (sector->page == page && sector->pgoff == pgoff)
1490  			return sector;
1491  	}
1492  	return NULL;
1493  }
1494  
1495  /*
1496   * this sets each page in the bio uptodate.  It should only be used on private
1497   * rbio pages, nothing that comes in from the higher layers
1498   */
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
1500  {
1501  	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1502  	struct bio_vec *bvec;
1503  	struct bvec_iter_all iter_all;
1504  
1505  	ASSERT(!bio_flagged(bio, BIO_CLONED));
1506  
1507  	bio_for_each_segment_all(bvec, bio, iter_all) {
1508  		struct sector_ptr *sector;
1509  		int pgoff;
1510  
1511  		for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
1512  		     pgoff += sectorsize) {
1513  			sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
1514  			ASSERT(sector);
1515  			if (sector)
1516  				sector->uptodate = 1;
1517  		}
1518  	}
1519  }
1520  
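/*
 * Return the index of the first sector (in either stripe_sectors[] or
 * bio_sectors[]) matching the first bvec of @bio.
 */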
static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
1522  {
1523  	struct bio_vec *bv = bio_first_bvec_all(bio);
1524  	int i;
1525  
1526  	for (i = 0; i < rbio->nr_sectors; i++) {
1527  		struct sector_ptr *sector;
1528  
1529  		sector = &rbio->stripe_sectors[i];
1530  		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
1531  			break;
1532  		sector = &rbio->bio_sectors[i];
1533  		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
1534  			break;
1535  	}
1536  	ASSERT(i < rbio->nr_sectors);
1537  	return i;
1538  }
1539  
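/* Mark every sector covered by @bio as errored in rbio->error_bitmap. */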
static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
1541  {
1542  	int total_sector_nr = get_bio_sector_nr(rbio, bio);
1543  	u32 bio_size = 0;
1544  	struct bio_vec *bvec;
1545  	int i;
1546  
1547  	bio_for_each_bvec_all(bvec, bio, i)
1548  		bio_size += bvec->bv_len;
1549  
1550  	/*
1551  	 * Since we can have multiple bios touching the error_bitmap, we cannot
1552  	 * call bitmap_set() without protection.
1553  	 *
1554  	 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
1555  	 */
1556  	for (i = total_sector_nr; i < total_sector_nr +
1557  	     (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
1558  		set_bit(i, rbio->error_bitmap);
1559  }
1560  
1561  /* Verify the data sectors at read time. */
static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
				    struct bio *bio)
1564  {
1565  	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1566  	int total_sector_nr = get_bio_sector_nr(rbio, bio);
1567  	struct bio_vec *bvec;
1568  	struct bvec_iter_all iter_all;
1569  
1570  	/* No data csum for the whole stripe, no need to verify. */
1571  	if (!rbio->csum_bitmap || !rbio->csum_buf)
1572  		return;
1573  
1574  	/* P/Q stripes, they have no data csum to verify against. */
1575  	if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
1576  		return;
1577  
1578  	bio_for_each_segment_all(bvec, bio, iter_all) {
1579  		int bv_offset;
1580  
1581  		for (bv_offset = bvec->bv_offset;
1582  		     bv_offset < bvec->bv_offset + bvec->bv_len;
1583  		     bv_offset += fs_info->sectorsize, total_sector_nr++) {
1584  			u8 csum_buf[BTRFS_CSUM_SIZE];
1585  			u8 *expected_csum = rbio->csum_buf +
1586  					    total_sector_nr * fs_info->csum_size;
1587  			int ret;
1588  
1589  			/* No csum for this sector, skip to the next sector. */
1590  			if (!test_bit(total_sector_nr, rbio->csum_bitmap))
1591  				continue;
1592  
1593  			ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
1594  				bv_offset, csum_buf, expected_csum);
1595  			if (ret < 0)
1596  				set_bit(total_sector_nr, rbio->error_bitmap);
1597  		}
1598  	}
1599  }
1600  
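/*
 * Endio for the synchronous read path: record errors, or mark the sectors
 * uptodate and verify their data checksums, then wake the waiter once the
 * last pending bio completes.
 */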
static void raid_wait_read_end_io(struct bio *bio)
1602  {
1603  	struct btrfs_raid_bio *rbio = bio->bi_private;
1604  
1605  	if (bio->bi_status) {
1606  		rbio_update_error_bitmap(rbio, bio);
1607  	} else {
1608  		set_bio_pages_uptodate(rbio, bio);
1609  		verify_bio_data_sectors(rbio, bio);
1610  	}
1611  
1612  	bio_put(bio);
1613  	if (atomic_dec_and_test(&rbio->stripes_pending))
1614  		wake_up(&rbio->io_wait);
1615  }
1616  
static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
				      struct bio_list *bio_list)
1619  {
1620  	struct bio *bio;
1621  
1622  	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
1623  	while ((bio = bio_list_pop(bio_list))) {
1624  		bio->bi_end_io = raid_wait_read_end_io;
1625  
1626  		if (trace_raid56_read_enabled()) {
1627  			struct raid56_bio_trace_info trace_info = { 0 };
1628  
1629  			bio_get_trace_info(rbio, bio, &trace_info);
1630  			trace_raid56_read(rbio, bio, &trace_info);
1631  		}
1632  		submit_bio(bio);
1633  	}
1634  
1635  	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
1636  }
1637  
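/* Allocate pages for the data stripes only (the first nr_data stripes) and index their sectors. */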
1638  static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
1639  {
1640  	const int data_pages = rbio->nr_data * rbio->stripe_npages;
1641  	int ret;
1642  
1643  	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
1644  	if (ret < 0)
1645  		return ret;
1646  
1647  	index_stripe_sectors(rbio);
1648  	return 0;
1649  }
1650  
1651  /*
1652   * We use plugging callbacks to collect full stripes.
1653   * Any time we get a partial stripe write while plugged,
1654   * we collect it into a list.  When the unplug comes down,
1655   * we sort the list by logical block number and merge
1656   * everything we can into the same rbios.
1657   */
1658  struct btrfs_plug_cb {
1659  	struct blk_plug_cb cb;
1660  	struct btrfs_fs_info *info;
1661  	struct list_head rbio_list;
1662  };
1663  
1664  /*
1665   * rbios on the plug list are sorted for easier merging.
1666   */
1667  static int plug_cmp(void *priv, const struct list_head *a,
1668  		    const struct list_head *b)
1669  {
1670  	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1671  						       plug_list);
1672  	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1673  						       plug_list);
1674  	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1675  	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1676  
1677  	if (a_sector < b_sector)
1678  		return -1;
1679  	if (a_sector > b_sector)
1680  		return 1;
1681  	return 0;
1682  }
1683  
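/*
 * Unplug callback.  Sort the plugged rbios by logical address, merge adjacent
 * partial stripes where possible, and queue the resulting rbios for RMW work.
 */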
1684  static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1685  {
1686  	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
1687  	struct btrfs_raid_bio *cur;
1688  	struct btrfs_raid_bio *last = NULL;
1689  
1690  	list_sort(NULL, &plug->rbio_list, plug_cmp);
1691  
1692  	while (!list_empty(&plug->rbio_list)) {
1693  		cur = list_entry(plug->rbio_list.next,
1694  				 struct btrfs_raid_bio, plug_list);
1695  		list_del_init(&cur->plug_list);
1696  
1697  		if (rbio_is_full(cur)) {
1698  			/* We have a full stripe, queue it down. */
1699  			start_async_work(cur, rmw_rbio_work);
1700  			continue;
1701  		}
1702  		if (last) {
1703  			if (rbio_can_merge(last, cur)) {
1704  				merge_rbio(last, cur);
1705  				free_raid_bio(cur);
1706  				continue;
1707  			}
1708  			start_async_work(last, rmw_rbio_work);
1709  		}
1710  		last = cur;
1711  	}
1712  	if (last)
1713  		start_async_work(last, rmw_rbio_work);
1714  	kfree(plug);
1715  }
1716  
1717  /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
1718  static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1719  {
1720  	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1721  	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
1722  	const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
1723  	const u32 orig_len = orig_bio->bi_iter.bi_size;
1724  	const u32 sectorsize = fs_info->sectorsize;
1725  	u64 cur_logical;
1726  
1727  	ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
1728  			    orig_logical + orig_len <= full_stripe_start +
1729  			    rbio->nr_data * BTRFS_STRIPE_LEN,
1730  			    rbio, orig_logical);
1731  
1732  	bio_list_add(&rbio->bio_list, orig_bio);
1733  	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1734  
1735  	/* Update the dbitmap. */
1736  	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1737  	     cur_logical += sectorsize) {
1738  		int bit = ((u32)(cur_logical - full_stripe_start) >>
1739  			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1740  
1741  		set_bit(bit, &rbio->dbitmap);
1742  	}
1743  }
1744  
1745  /*
1746   * Our main entry point for writes from the rest of the FS.
1747   */
1748  void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
1749  {
1750  	struct btrfs_fs_info *fs_info = bioc->fs_info;
1751  	struct btrfs_raid_bio *rbio;
1752  	struct btrfs_plug_cb *plug = NULL;
1753  	struct blk_plug_cb *cb;
1754  
1755  	rbio = alloc_rbio(fs_info, bioc);
1756  	if (IS_ERR(rbio)) {
1757  		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
1758  		bio_endio(bio);
1759  		return;
1760  	}
1761  	rbio->operation = BTRFS_RBIO_WRITE;
1762  	rbio_add_bio(rbio, bio);
1763  
1764  	/*
1765  	 * Don't plug on full rbios, just get them out the door
1766  	 * as quickly as we can.
1767  	 */
1768  	if (!rbio_is_full(rbio)) {
1769  		cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
1770  		if (cb) {
1771  			plug = container_of(cb, struct btrfs_plug_cb, cb);
1772  			if (!plug->info) {
1773  				plug->info = fs_info;
1774  				INIT_LIST_HEAD(&plug->rbio_list);
1775  			}
1776  			list_add_tail(&rbio->plug_list, &plug->rbio_list);
1777  			return;
1778  		}
1779  	}
1780  
1781  	/*
1782  	 * Either we don't have any existing plug, or we're doing a full stripe,
1783  	 * queue the rmw work now.
1784  	 */
1785  	start_async_work(rbio, rmw_rbio_work);
1786  }
1787  
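/*
 * Verify one data sector against its expected checksum in rbio->csum_buf.
 *
 * Returns 0 if there is nothing to verify or the sector matches its csum,
 * or a negative errno if the verification failed.
 */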
1788  static int verify_one_sector(struct btrfs_raid_bio *rbio,
1789  			     int stripe_nr, int sector_nr)
1790  {
1791  	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1792  	struct sector_ptr *sector;
1793  	u8 csum_buf[BTRFS_CSUM_SIZE];
1794  	u8 *csum_expected;
1795  	int ret;
1796  
1797  	if (!rbio->csum_bitmap || !rbio->csum_buf)
1798  		return 0;
1799  
1800  	/* No way to verify P/Q as they are not covered by data csum. */
1801  	if (stripe_nr >= rbio->nr_data)
1802  		return 0;
1803  	/*
1804  	 * If we're rebuilding a read, we have to use pages from the
1805  	 * bio list if possible.
1806  	 */
1807  	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1808  		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1809  	} else {
1810  		sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1811  	}
1812  
1813  	ASSERT(sector->page);
1814  
1815  	csum_expected = rbio->csum_buf +
1816  			(stripe_nr * rbio->stripe_nsectors + sector_nr) *
1817  			fs_info->csum_size;
1818  	ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
1819  				      csum_buf, csum_expected);
1820  	return ret;
1821  }
1822  
1823  /*
1824   * Recover a vertical stripe specified by @sector_nr.
1825   * @pointers and @unmap_array are pre-allocated by the caller, so we don't
1826   * need to allocate/free them again and again.
1827   */
1828  static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
1829  			    void **pointers, void **unmap_array)
1830  {
1831  	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1832  	struct sector_ptr *sector;
1833  	const u32 sectorsize = fs_info->sectorsize;
1834  	int found_errors;
1835  	int faila;
1836  	int failb;
1837  	int stripe_nr;
1838  	int ret = 0;
1839  
1840  	/*
1841  	 * When doing parity scrub, the dbitmap marks the horizontal stripes
1842  	 * in which we have data; skip the ones without data.
1843  	 */
1844  	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1845  	    !test_bit(sector_nr, &rbio->dbitmap))
1846  		return 0;
1847  
1848  	found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
1849  						 &failb);
1850  	/*
1851  	 * No errors in the vertical stripe, skip it.  Can happen for recovery
1852  	 * where only part of a stripe failed the csum check.
1853  	 */
1854  	if (!found_errors)
1855  		return 0;
1856  
1857  	if (found_errors > rbio->bioc->max_errors)
1858  		return -EIO;
1859  
1860  	/*
1861  	 * Set up our array of pointers with sectors from each stripe.
1862  	 *
1863  	 * NOTE: store a duplicate array of pointers to preserve the
1864  	 * pointer order.
1865  	 */
1866  	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1867  		/*
1868  		 * If we're rebuilding a read, we have to use pages from the
1869  		 * bio list if possible.
1870  		 */
1871  		if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1872  			sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1873  		} else {
1874  			sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1875  		}
1876  		ASSERT(sector->page);
1877  		pointers[stripe_nr] = kmap_local_page(sector->page) +
1878  				   sector->pgoff;
1879  		unmap_array[stripe_nr] = pointers[stripe_nr];
1880  	}
1881  
1882  	/* All raid6 handling here */
1883  	if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1884  		/* Single failure, rebuild from parity raid5 style */
1885  		if (failb < 0) {
1886  			if (faila == rbio->nr_data)
1887  				/*
1888  				 * Just the P stripe has failed, without
1889  				 * a bad data or Q stripe.
1890  				 * We have nothing to do, just skip the
1891  				 * recovery for this stripe.
1892  				 */
1893  				goto cleanup;
1894  			/*
1895  			 * a single failure in raid6 is rebuilt
1896  			 * in the pstripe code below
1897  			 */
1898  			goto pstripe;
1899  		}
1900  
1901  		/*
1902  		 * If the q stripe is failed, do a pstripe reconstruction from
1903  		 * the xors.
1904  		 * If both the q stripe and the P stripe are failed, we're
1905  		 * here due to a crc mismatch and we can't give them the
1906  		 * data they want.
1907  		 */
1908  		if (failb == rbio->real_stripes - 1) {
1909  			if (faila == rbio->real_stripes - 2)
1910  				/*
1911  				 * Only P and Q are corrupted.
1912  				 * We only care about data stripes recovery,
1913  				 * can skip this vertical stripe.
1914  				 */
1915  				goto cleanup;
1916  			/*
1917  			 * Otherwise we have one bad data stripe and
1918  			 * a good P stripe.  raid5!
1919  			 */
1920  			goto pstripe;
1921  		}
1922  
1923  		if (failb == rbio->real_stripes - 2) {
1924  			raid6_datap_recov(rbio->real_stripes, sectorsize,
1925  					  faila, pointers);
1926  		} else {
1927  			raid6_2data_recov(rbio->real_stripes, sectorsize,
1928  					  faila, failb, pointers);
1929  		}
1930  	} else {
1931  		void *p;
1932  
1933  		/* Rebuild from P stripe here (raid5 or raid6). */
1934  		ASSERT(failb == -1);
1935  pstripe:
1936  		/* Copy parity block into failed block to start with */
1937  		memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
1938  
1939  		/* Rearrange the pointer array */
1940  		p = pointers[faila];
1941  		for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
1942  		     stripe_nr++)
1943  			pointers[stripe_nr] = pointers[stripe_nr + 1];
1944  		pointers[rbio->nr_data - 1] = p;
1945  
1946  		/* Xor in the rest */
1947  		run_xor(pointers, rbio->nr_data - 1, sectorsize);
1948  
1949  	}
1950  
1951  	/*
1952  	 * Whether this is an RMW or a recovery, all failed sectors in the
1953  	 * vertical stripe should now be repaired, and thus they are
1954  	 * uptodate.
1955  	 * Especially if we decide to cache the rbio, we need at least all
1956  	 * data sectors to be uptodate.
1957  	 *
1958  	 * If possible, also check if the repaired sector matches its data
1959  	 * checksum.
1960  	 */
1961  	if (faila >= 0) {
1962  		ret = verify_one_sector(rbio, faila, sector_nr);
1963  		if (ret < 0)
1964  			goto cleanup;
1965  
1966  		sector = rbio_stripe_sector(rbio, faila, sector_nr);
1967  		sector->uptodate = 1;
1968  	}
1969  	if (failb >= 0) {
1970  		ret = verify_one_sector(rbio, failb, sector_nr);
1971  		if (ret < 0)
1972  			goto cleanup;
1973  
1974  		sector = rbio_stripe_sector(rbio, failb, sector_nr);
1975  		sector->uptodate = 1;
1976  	}
1977  
1978  cleanup:
1979  	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
1980  		kunmap_local(unmap_array[stripe_nr]);
1981  	return ret;
1982  }
1983  
1984  static int recover_sectors(struct btrfs_raid_bio *rbio)
1985  {
1986  	void **pointers = NULL;
1987  	void **unmap_array = NULL;
1988  	int sectornr;
1989  	int ret = 0;
1990  
1991  	/*
1992  	 * @pointers array stores the pointer for each sector.
1993  	 *
1994  	 * @unmap_array stores copy of pointers that does not get reordered
1995  	 * during reconstruction so that kunmap_local works.
1996  	 */
1997  	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1998  	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1999  	if (!pointers || !unmap_array) {
2000  		ret = -ENOMEM;
2001  		goto out;
2002  	}
2003  
2004  	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
2005  		spin_lock(&rbio->bio_list_lock);
2006  		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2007  		spin_unlock(&rbio->bio_list_lock);
2008  	}
2009  
2010  	index_rbio_pages(rbio);
2011  
2012  	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2013  		ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
2014  		if (ret < 0)
2015  			break;
2016  	}
2017  
2018  out:
2019  	kfree(pointers);
2020  	kfree(unmap_array);
2021  	return ret;
2022  }
2023  
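/*
 * Main recovery path: read all sectors of the full stripe that have not
 * failed, rebuild the failed ones vertically, then complete the original
 * bios with the result.
 */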
2024  static void recover_rbio(struct btrfs_raid_bio *rbio)
2025  {
2026  	struct bio_list bio_list = BIO_EMPTY_LIST;
2027  	int total_sector_nr;
2028  	int ret = 0;
2029  
2030  	/*
2031  	 * Whether we're recovering from a read failure or a degraded write,
2032  	 * the caller should have set the error bitmap correctly.
2033  	 */
2034  	ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
2035  
2036  	/* For recovery, we need to read all sectors including P/Q. */
2037  	ret = alloc_rbio_pages(rbio);
2038  	if (ret < 0)
2039  		goto out;
2040  
2041  	index_rbio_pages(rbio);
2042  
2043  	/*
2044  	 * Read everything that hasn't failed.  However this time we will
2045  	 * not trust any cached sector, as a cached sector can contain
2046  	 * stale data in the part that the higher layer is not reading.
2047  	 *
2048  	 * So here we always re-read everything in the recovery path, rather
2049  	 * than relying on the cache.
2050  	 */
2051  	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2052  	     total_sector_nr++) {
2053  		int stripe = total_sector_nr / rbio->stripe_nsectors;
2054  		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2055  		struct sector_ptr *sector;
2056  
2057  		/*
2058  		 * Skip the range which has error.  It can be a range which is
2059  		 * marked error (for csum mismatch), or it can be a missing
2060  		 * device.
2061  		 */
2062  		if (!rbio->bioc->stripes[stripe].dev->bdev ||
2063  		    test_bit(total_sector_nr, rbio->error_bitmap)) {
2064  			/*
2065  			 * Also set the error bit for missing device, which
2066  			 * may not yet have its error bit set.
2067  			 */
2068  			set_bit(total_sector_nr, rbio->error_bitmap);
2069  			continue;
2070  		}
2071  
2072  		sector = rbio_stripe_sector(rbio, stripe, sectornr);
2073  		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2074  					 sectornr, REQ_OP_READ);
2075  		if (ret < 0) {
2076  			bio_list_put(&bio_list);
2077  			goto out;
2078  		}
2079  	}
2080  
2081  	submit_read_wait_bio_list(rbio, &bio_list);
2082  	ret = recover_sectors(rbio);
2083  out:
2084  	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2085  }
2086  
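/*
 * Recovery work handler which needs to grab the full stripe lock first.
 * If the stripe is already locked, the rbio is queued (or merged) and will be
 * handled once the current lock owner releases the lock.
 */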
2087  static void recover_rbio_work(struct work_struct *work)
2088  {
2089  	struct btrfs_raid_bio *rbio;
2090  
2091  	rbio = container_of(work, struct btrfs_raid_bio, work);
2092  	if (!lock_stripe_add(rbio))
2093  		recover_rbio(rbio);
2094  }
2095  
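/* Recovery work handler for rbios that already hold the full stripe lock. */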
2096  static void recover_rbio_work_locked(struct work_struct *work)
2097  {
2098  	recover_rbio(container_of(work, struct btrfs_raid_bio, work));
2099  }
2100  
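/*
 * For RAID6 retries with mirror_num > 2, mark one extra stripe as failed in
 * every vertical stripe which already has an error, so that a different
 * rebuild combination is attempted on each retry.
 */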
2101  static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
2102  {
2103  	bool found = false;
2104  	int sector_nr;
2105  
2106  	/*
2107  	 * This is for RAID6 extra recovery tries, thus the mirror number
2108  	 * should be larger than 2.
2109  	 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
2110  	 * RAID5 methods.
2111  	 */
2112  	ASSERT(mirror_num > 2);
2113  	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2114  		int found_errors;
2115  		int faila;
2116  		int failb;
2117  
2118  		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2119  							 &faila, &failb);
2120  		/* This vertical stripe doesn't have errors. */
2121  		if (!found_errors)
2122  			continue;
2123  
2124  		/*
2125  		 * If we found errors, there should be only one error marked
2126  		 * by previous set_rbio_range_error().
2127  		 */
2128  		ASSERT(found_errors == 1);
2129  		found = true;
2130  
2131  		/* Now select another stripe to mark as error. */
2132  		failb = rbio->real_stripes - (mirror_num - 1);
2133  		if (failb <= faila)
2134  			failb--;
2135  
2136  		/* Set the extra bit in error bitmap. */
2137  		if (failb >= 0)
2138  			set_bit(failb * rbio->stripe_nsectors + sector_nr,
2139  				rbio->error_bitmap);
2140  	}
2141  
2142  	/* We should have found at least one vertical stripe with an error. */
2143  	ASSERT(found);
2144  }
2145  
2146  /*
2147   * The main entry point for reads from the higher layers.  This
2148   * is really only called when the normal read path had a failure,
2149   * so we assume the bio they send down corresponds to a failed part
2150   * of the drive.
2151   */
2152  void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2153  			   int mirror_num)
2154  {
2155  	struct btrfs_fs_info *fs_info = bioc->fs_info;
2156  	struct btrfs_raid_bio *rbio;
2157  
2158  	rbio = alloc_rbio(fs_info, bioc);
2159  	if (IS_ERR(rbio)) {
2160  		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
2161  		bio_endio(bio);
2162  		return;
2163  	}
2164  
2165  	rbio->operation = BTRFS_RBIO_READ_REBUILD;
2166  	rbio_add_bio(rbio, bio);
2167  
2168  	set_rbio_range_error(rbio, bio);
2169  
2170  	/*
2171  	 * Loop retry:
2172  	 * for 'mirror_num == 2', reconstruct from all other stripes.
2173  	 * for 'mirror_num > 2', select a stripe to fail on every retry.
2174  	 */
2175  	if (mirror_num > 2)
2176  		set_rbio_raid6_extra_error(rbio, mirror_num);
2177  
2178  	start_async_work(rbio, recover_rbio_work);
2179  }
2180  
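/*
 * Look up the data checksums covering the full stripe and store them into
 * rbio->csum_buf / rbio->csum_bitmap, so that the read end io handler can
 * verify the data sectors before they are used for RMW.
 */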
2181  static void fill_data_csums(struct btrfs_raid_bio *rbio)
2182  {
2183  	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2184  	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
2185  						       rbio->bioc->full_stripe_logical);
2186  	const u64 start = rbio->bioc->full_stripe_logical;
2187  	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
2188  			fs_info->sectorsize_bits;
2189  	int ret;
2190  
2191  	/* The rbio should not have its csum buffer initialized. */
2192  	ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
2193  
2194  	/*
2195  	 * Skip the csum search if:
2196  	 *
2197  	 * - The rbio doesn't belong to data block groups
2198  	 *   Then we are doing IO for tree blocks, no need to search csums.
2199  	 *
2200  	 * - The rbio belongs to mixed block groups
2201  	 *   This is to avoid deadlock, as we're already holding the full
2202  	 *   stripe lock, if we trigger a metadata read, and it needs to do
2203  	 *   raid56 recovery, we will deadlock.
2204  	 */
2205  	if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
2206  	    rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
2207  		return;
2208  
2209  	rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
2210  				 fs_info->csum_size, GFP_NOFS);
2211  	rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
2212  					  GFP_NOFS);
2213  	if (!rbio->csum_buf || !rbio->csum_bitmap) {
2214  		ret = -ENOMEM;
2215  		goto error;
2216  	}
2217  
2218  	ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
2219  					rbio->csum_buf, rbio->csum_bitmap);
2220  	if (ret < 0)
2221  		goto error;
2222  	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
2223  		goto no_csum;
2224  	return;
2225  
2226  error:
2227  	/*
2228  	 * We failed to allocate memory or grab the csum, but it's not fatal,
2229  	 * we can still continue.  But better to warn users that RMW is no
2230  	 * longer safe for this particular sub-stripe write.
2231  	 */
2232  	btrfs_warn_rl(fs_info,
2233  "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
2234  			rbio->bioc->full_stripe_logical, ret);
2235  no_csum:
2236  	kfree(rbio->csum_buf);
2237  	bitmap_free(rbio->csum_bitmap);
2238  	rbio->csum_buf = NULL;
2239  	rbio->csum_bitmap = NULL;
2240  }
2241  
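/*
 * Read phase of a sub-stripe RMW: read the full stripe, verify the data
 * checksums at end io time, then recover any sectors which failed the read
 * or the verification.
 */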
2242  static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
2243  {
2244  	struct bio_list bio_list = BIO_EMPTY_LIST;
2245  	int total_sector_nr;
2246  	int ret = 0;
2247  
2248  	/*
2249  	 * Fill the data csums we need for data verification.  We need to fill
2250  	 * the csum_bitmap/csum_buf first, as our endio function will try to
2251  	 * verify the data sectors.
2252  	 */
2253  	fill_data_csums(rbio);
2254  
2255  	/*
2256  	 * Build a list of bios to read all sectors (including data and P/Q).
2257  	 *
2258  	 * This is needed for the later csum verification and recovery.
2259  	 */
2260  	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2261  	     total_sector_nr++) {
2262  		struct sector_ptr *sector;
2263  		int stripe = total_sector_nr / rbio->stripe_nsectors;
2264  		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2265  
2266  		sector = rbio_stripe_sector(rbio, stripe, sectornr);
2267  		ret = rbio_add_io_sector(rbio, &bio_list, sector,
2268  			       stripe, sectornr, REQ_OP_READ);
2269  		if (ret) {
2270  			bio_list_put(&bio_list);
2271  			return ret;
2272  		}
2273  	}
2274  
2275  	/*
2276  	 * We may or may not have any corrupted sectors (including missing dev
2277  	 * and csum mismatch), just let recover_sectors() handle them all.
2278  	 */
2279  	submit_read_wait_bio_list(rbio, &bio_list);
2280  	return recover_sectors(rbio);
2281  }
2282  
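/*
 * End io handler for the write phase.  Record any write error in the error
 * bitmap and wake up the waiter once all pending stripe bios have completed.
 */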
2283  static void raid_wait_write_end_io(struct bio *bio)
2284  {
2285  	struct btrfs_raid_bio *rbio = bio->bi_private;
2286  	blk_status_t err = bio->bi_status;
2287  
2288  	if (err)
2289  		rbio_update_error_bitmap(rbio, bio);
2290  	bio_put(bio);
2291  	if (atomic_dec_and_test(&rbio->stripes_pending))
2292  		wake_up(&rbio->io_wait);
2293  }
2294  
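/*
 * Submit all write bios in @bio_list.  Unlike the read helper, this does not
 * wait for completion; the callers wait on rbio->io_wait themselves.
 */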
2295  static void submit_write_bios(struct btrfs_raid_bio *rbio,
2296  			      struct bio_list *bio_list)
2297  {
2298  	struct bio *bio;
2299  
2300  	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
2301  	while ((bio = bio_list_pop(bio_list))) {
2302  		bio->bi_end_io = raid_wait_write_end_io;
2303  
2304  		if (trace_raid56_write_enabled()) {
2305  			struct raid56_bio_trace_info trace_info = { 0 };
2306  
2307  			bio_get_trace_info(rbio, bio, &trace_info);
2308  			trace_raid56_write(rbio, bio, &trace_info);
2309  		}
2310  		submit_bio(bio);
2311  	}
2312  }
2313  
2314  /*
2315   * Determine whether we need to read any sector from the disk.
2316   * Should only be used in the RMW path, to skip reads for a cached rbio.
2317   */
2318  static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
2319  {
2320  	int i;
2321  
2322  	for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
2323  		struct sector_ptr *sector = &rbio->stripe_sectors[i];
2324  
2325  		/*
2326  		 * We have a sector which has no page or is not uptodate,
2327  		 * thus this rbio cannot be a cached one, as a cached rbio
2328  		 * must have all its data sectors present and uptodate.
2329  		 */
2330  		if (!sector->page || !sector->uptodate)
2331  			return true;
2332  	}
2333  	return false;
2334  }
2335  
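/*
 * The main RMW path for writes: optionally read and recover the missing data
 * sectors, generate P/Q for every vertical stripe, and write the result back
 * to the devices.
 */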
2336  static void rmw_rbio(struct btrfs_raid_bio *rbio)
2337  {
2338  	struct bio_list bio_list;
2339  	int sectornr;
2340  	int ret = 0;
2341  
2342  	/*
2343  	 * Allocate the pages for parity first, as P/Q pages will always be
2344  	 * needed for both full-stripe and sub-stripe writes.
2345  	 */
2346  	ret = alloc_rbio_parity_pages(rbio);
2347  	if (ret < 0)
2348  		goto out;
2349  
2350  	/*
2351  	 * For a full stripe write, or when every data sector is already
2352  	 * cached, we can go to the write path immediately.
2353  	 */
2354  	if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
2355  		/*
2356  		 * We're doing a sub-stripe write, so we also need all the data
2357  		 * stripes to do the full RMW.
2358  		 */
2359  		ret = alloc_rbio_data_pages(rbio);
2360  		if (ret < 0)
2361  			goto out;
2362  
2363  		index_rbio_pages(rbio);
2364  
2365  		ret = rmw_read_wait_recover(rbio);
2366  		if (ret < 0)
2367  			goto out;
2368  	}
2369  
2370  	/*
2371  	 * At this stage we're not allowed to add any new bios to the
2372  	 * bio list any more, anyone else that wants to change this stripe
2373  	 * needs to do their own rmw.
2374  	 */
2375  	spin_lock(&rbio->bio_list_lock);
2376  	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2377  	spin_unlock(&rbio->bio_list_lock);
2378  
2379  	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2380  
2381  	index_rbio_pages(rbio);
2382  
2383  	/*
2384  	 * We don't cache full rbios because we're assuming
2385  	 * the higher layers are unlikely to use this area of
2386  	 * the disk again soon.  If they do use it again,
2387  	 * hopefully they will send another full bio.
2388  	 */
2389  	if (!rbio_is_full(rbio))
2390  		cache_rbio_pages(rbio);
2391  	else
2392  		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2393  
2394  	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
2395  		generate_pq_vertical(rbio, sectornr);
2396  
2397  	bio_list_init(&bio_list);
2398  	ret = rmw_assemble_write_bios(rbio, &bio_list);
2399  	if (ret < 0)
2400  		goto out;
2401  
2402  	/* We should have at least one bio assembled. */
2403  	ASSERT(bio_list_size(&bio_list));
2404  	submit_write_bios(rbio, &bio_list);
2405  	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2406  
2407  	/* We may have more errors than our tolerance during the read. */
2408  	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2409  		int found_errors;
2410  
2411  		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
2412  		if (found_errors > rbio->bioc->max_errors) {
2413  			ret = -EIO;
2414  			break;
2415  		}
2416  	}
2417  out:
2418  	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2419  }
2420  
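/* RMW work handler which needs to grab the full stripe lock first. */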
2421  static void rmw_rbio_work(struct work_struct *work)
2422  {
2423  	struct btrfs_raid_bio *rbio;
2424  
2425  	rbio = container_of(work, struct btrfs_raid_bio, work);
2426  	if (lock_stripe_add(rbio) == 0)
2427  		rmw_rbio(rbio);
2428  }
2429  
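/* RMW work handler for rbios that already hold the full stripe lock. */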
2430  static void rmw_rbio_work_locked(struct work_struct *work)
2431  {
2432  	rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
2433  }
2434  
2435  /*
2436   * The following code is used to scrub/replace the parity stripe
2437   *
2438   * Caller must have already increased bio_counter for getting @bioc.
2439   *
2440   * Note: we need to make sure all the pages added into the scrub/replace
2441   * raid bio are correct and will not change during the scrub/replace, i.e.
2442   * those pages hold only metadata or file data protected by a checksum.
2443   */
2444  
2445  struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2446  				struct btrfs_io_context *bioc,
2447  				struct btrfs_device *scrub_dev,
2448  				unsigned long *dbitmap, int stripe_nsectors)
2449  {
2450  	struct btrfs_fs_info *fs_info = bioc->fs_info;
2451  	struct btrfs_raid_bio *rbio;
2452  	int i;
2453  
2454  	rbio = alloc_rbio(fs_info, bioc);
2455  	if (IS_ERR(rbio))
2456  		return NULL;
2457  	bio_list_add(&rbio->bio_list, bio);
2458  	/*
2459  	 * This is a special bio which is used to hold the completion handler
2460  	 * and make the scrub rbio similar to the other types.
2461  	 */
2462  	ASSERT(!bio->bi_iter.bi_size);
2463  	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2464  
2465  	/*
2466  	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2467  	 * to the end position, so this search can start from the first parity
2468  	 * stripe.
2469  	 */
2470  	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2471  		if (bioc->stripes[i].dev == scrub_dev) {
2472  			rbio->scrubp = i;
2473  			break;
2474  		}
2475  	}
2476  	ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);
2477  
2478  	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
2479  	return rbio;
2480  }
2481  
2482  /*
2483   * We only scrub the parity for which we have correct data on the same
2484   * horizontal stripe, so we don't need to allocate pages for all the stripes.
2485   */
2486  static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2487  {
2488  	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2489  	int total_sector_nr;
2490  
2491  	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2492  	     total_sector_nr++) {
2493  		struct page *page;
2494  		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2495  		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
2496  
2497  		if (!test_bit(sectornr, &rbio->dbitmap))
2498  			continue;
2499  		if (rbio->stripe_pages[index])
2500  			continue;
2501  		page = alloc_page(GFP_NOFS);
2502  		if (!page)
2503  			return -ENOMEM;
2504  		rbio->stripe_pages[index] = page;
2505  	}
2506  	index_stripe_sectors(rbio);
2507  	return 0;
2508  }
2509  
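/*
 * Recompute P/Q from the data stripes, compare the result against the on-disk
 * parity of the stripe being scrubbed, and write back only the sectors that
 * differ (duplicating the writes to the replace target if needed).
 */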
2510  static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
2511  {
2512  	struct btrfs_io_context *bioc = rbio->bioc;
2513  	const u32 sectorsize = bioc->fs_info->sectorsize;
2514  	void **pointers = rbio->finish_pointers;
2515  	unsigned long *pbitmap = &rbio->finish_pbitmap;
2516  	int nr_data = rbio->nr_data;
2517  	int stripe;
2518  	int sectornr;
2519  	bool has_qstripe;
2520  	struct sector_ptr p_sector = { 0 };
2521  	struct sector_ptr q_sector = { 0 };
2522  	struct bio_list bio_list;
2523  	int is_replace = 0;
2524  	int ret;
2525  
2526  	bio_list_init(&bio_list);
2527  
2528  	if (rbio->real_stripes - rbio->nr_data == 1)
2529  		has_qstripe = false;
2530  	else if (rbio->real_stripes - rbio->nr_data == 2)
2531  		has_qstripe = true;
2532  	else
2533  		BUG();
2534  
2535  	/*
2536  	 * If replace is running and our P/Q stripe is being replaced, then we
2537  	 * need to duplicate the final write to the replace target.
2538  	 */
2539  	if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
2540  		is_replace = 1;
2541  		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
2542  	}
2543  
2544  	/*
2545  	 * The higher layers (the scrubber) are unlikely to
2546  	 * use this area of the disk again soon, so don't cache
2547  	 * it.
2548  	 */
2549  	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2550  
2551  	p_sector.page = alloc_page(GFP_NOFS);
2552  	if (!p_sector.page)
2553  		return -ENOMEM;
2554  	p_sector.pgoff = 0;
2555  	p_sector.uptodate = 1;
2556  
2557  	if (has_qstripe) {
2558  		/* RAID6, allocate and map temp space for the Q stripe */
2559  		q_sector.page = alloc_page(GFP_NOFS);
2560  		if (!q_sector.page) {
2561  			__free_page(p_sector.page);
2562  			p_sector.page = NULL;
2563  			return -ENOMEM;
2564  		}
2565  		q_sector.pgoff = 0;
2566  		q_sector.uptodate = 1;
2567  		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
2568  	}
2569  
2570  	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2571  
2572  	/* Map the parity stripe just once */
2573  	pointers[nr_data] = kmap_local_page(p_sector.page);
2574  
2575  	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2576  		struct sector_ptr *sector;
2577  		void *parity;
2578  
2579  		/* first collect one page from each data stripe */
2580  		for (stripe = 0; stripe < nr_data; stripe++) {
2581  			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2582  			pointers[stripe] = kmap_local_page(sector->page) +
2583  					   sector->pgoff;
2584  		}
2585  
2586  		if (has_qstripe) {
2587  			assert_rbio(rbio);
2588  			/* RAID6, call the library function to fill in our P/Q */
2589  			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
2590  						pointers);
2591  		} else {
2592  			/* raid5 */
2593  			memcpy(pointers[nr_data], pointers[0], sectorsize);
2594  			run_xor(pointers + 1, nr_data - 1, sectorsize);
2595  		}
2596  
2597  		/* Check scrubbing parity and repair it */
2598  		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2599  		parity = kmap_local_page(sector->page) + sector->pgoff;
2600  		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2601  			memcpy(parity, pointers[rbio->scrubp], sectorsize);
2602  		else
2603  			/* Parity is right, no need to write it back. */
2604  			bitmap_clear(&rbio->dbitmap, sectornr, 1);
2605  		kunmap_local(parity);
2606  
2607  		for (stripe = nr_data - 1; stripe >= 0; stripe--)
2608  			kunmap_local(pointers[stripe]);
2609  	}
2610  
2611  	kunmap_local(pointers[nr_data]);
2612  	__free_page(p_sector.page);
2613  	p_sector.page = NULL;
2614  	if (q_sector.page) {
2615  		kunmap_local(pointers[rbio->real_stripes - 1]);
2616  		__free_page(q_sector.page);
2617  		q_sector.page = NULL;
2618  	}
2619  
2620  	/*
2621  	 * Time to start writing.  Make bios for everything from the
2622  	 * higher layers (the bio_list in our rbio) and our P/Q.  Ignore
2623  	 * everything else.
2624  	 */
2625  	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2626  		struct sector_ptr *sector;
2627  
2628  		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2629  		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2630  					 sectornr, REQ_OP_WRITE);
2631  		if (ret)
2632  			goto cleanup;
2633  	}
2634  
2635  	if (!is_replace)
2636  		goto submit_write;
2637  
2638  	/*
2639  	 * Replace is running and our parity stripe needs to be duplicated to
2640  	 * the target device.  Check we have a valid source stripe number.
2641  	 */
2642  	ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
2643  	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2644  		struct sector_ptr *sector;
2645  
2646  		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2647  		ret = rbio_add_io_sector(rbio, &bio_list, sector,
2648  					 rbio->real_stripes,
2649  					 sectornr, REQ_OP_WRITE);
2650  		if (ret)
2651  			goto cleanup;
2652  	}
2653  
2654  submit_write:
2655  	submit_write_bios(rbio, &bio_list);
2656  	return 0;
2657  
2658  cleanup:
2659  	bio_list_put(&bio_list);
2660  	return ret;
2661  }
2662  
2663  static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2664  {
2665  	if (stripe >= 0 && stripe < rbio->nr_data)
2666  		return 1;
2667  	return 0;
2668  }
2669  
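/*
 * Recover failed sectors for a scrub rbio.  Unlike the regular recovery path,
 * the parity being scrubbed cannot be trusted as a source for repairing data.
 */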
2670  static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
2671  {
2672  	void **pointers = NULL;
2673  	void **unmap_array = NULL;
2674  	int sector_nr;
2675  	int ret = 0;
2676  
2677  	/*
2678  	 * @pointers array stores the pointer for each sector.
2679  	 *
2680  	 * @unmap_array stores copy of pointers that does not get reordered
2681  	 * during reconstruction so that kunmap_local works.
2682  	 */
2683  	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2684  	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2685  	if (!pointers || !unmap_array) {
2686  		ret = -ENOMEM;
2687  		goto out;
2688  	}
2689  
2690  	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2691  		int dfail = 0, failp = -1;
2692  		int faila;
2693  		int failb;
2694  		int found_errors;
2695  
2696  		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2697  							 &faila, &failb);
2698  		if (found_errors > rbio->bioc->max_errors) {
2699  			ret = -EIO;
2700  			goto out;
2701  		}
2702  		if (found_errors == 0)
2703  			continue;
2704  
2705  		/* We should have at least one error here. */
2706  		ASSERT(faila >= 0 || failb >= 0);
2707  
2708  		if (is_data_stripe(rbio, faila))
2709  			dfail++;
2710  		else if (is_parity_stripe(faila))
2711  			failp = faila;
2712  
2713  		if (is_data_stripe(rbio, failb))
2714  			dfail++;
2715  		else if (is_parity_stripe(failb))
2716  			failp = failb;
2717  		/*
2718  		 * Because we cannot use the parity being scrubbed to repair
2719  		 * the data, the repair capability is reduced.  (In the case
2720  		 * of RAID5, we cannot repair anything.)
2721  		 */
2722  		if (dfail > rbio->bioc->max_errors - 1) {
2723  			ret = -EIO;
2724  			goto out;
2725  		}
2726  		/*
2727  		 * If all the data is good and only the parity is bad, just
2728  		 * repair the parity, no need to recover data stripes.
2729  		 */
2730  		if (dfail == 0)
2731  			continue;
2732  
2733  		/*
2734  		 * Getting here means we have one corrupted data stripe and
2735  		 * one corrupted parity on RAID6.  If the corrupted parity is
2736  		 * the one being scrubbed, we can luckily use the other parity
2737  		 * to repair the data; otherwise we cannot repair the data stripe.
2738  		 */
2739  		if (failp != rbio->scrubp) {
2740  			ret = -EIO;
2741  			goto out;
2742  		}
2743  
2744  		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
2745  		if (ret < 0)
2746  			goto out;
2747  	}
2748  out:
2749  	kfree(pointers);
2750  	kfree(unmap_array);
2751  	return ret;
2752  }
2753  
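/*
 * Read every sector in the vertical stripes marked in dbitmap which is not
 * already present and uptodate in memory, then wait for the reads to finish.
 */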
2754  static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
2755  {
2756  	struct bio_list bio_list = BIO_EMPTY_LIST;
2757  	int total_sector_nr;
2758  	int ret = 0;
2759  
2760  	/* Build a list of bios to read all the missing parts. */
2761  	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2762  	     total_sector_nr++) {
2763  		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2764  		int stripe = total_sector_nr / rbio->stripe_nsectors;
2765  		struct sector_ptr *sector;
2766  
2767  		/* No data in the vertical stripe, no need to read. */
2768  		if (!test_bit(sectornr, &rbio->dbitmap))
2769  			continue;
2770  
2771  		/*
2772  		 * We want to find all the sectors missing from the rbio and
2773  		 * read them from the disk. If sector_in_rbio() finds a sector
2774  		 * in the bio list we don't need to read it off the stripe.
2775  		 */
2776  		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2777  		if (sector)
2778  			continue;
2779  
2780  		sector = rbio_stripe_sector(rbio, stripe, sectornr);
2781  		/*
2782  		 * The bio cache may have handed us an uptodate sector.  If so,
2783  		 * use it.
2784  		 */
2785  		if (sector->uptodate)
2786  			continue;
2787  
2788  		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2789  					 sectornr, REQ_OP_READ);
2790  		if (ret) {
2791  			bio_list_put(&bio_list);
2792  			return ret;
2793  		}
2794  	}
2795  
2796  	submit_read_wait_bio_list(rbio, &bio_list);
2797  	return 0;
2798  }
2799  
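/*
 * The main scrub path: read the needed sectors, recover any failures, then
 * check and repair the parity of the stripe being scrubbed.
 */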
2800  static void scrub_rbio(struct btrfs_raid_bio *rbio)
2801  {
2802  	int sector_nr;
2803  	int ret;
2804  
2805  	ret = alloc_rbio_essential_pages(rbio);
2806  	if (ret)
2807  		goto out;
2808  
2809  	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2810  
2811  	ret = scrub_assemble_read_bios(rbio);
2812  	if (ret < 0)
2813  		goto out;
2814  
2815  	/* We may have some failures, recover the failed sectors first. */
2816  	ret = recover_scrub_rbio(rbio);
2817  	if (ret < 0)
2818  		goto out;
2819  
2820  	/*
2821  	 * We have every sector properly prepared. We can finish the scrub
2822  	 * and write back the good content.
2823  	 */
2824  	ret = finish_parity_scrub(rbio);
2825  	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2826  	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2827  		int found_errors;
2828  
2829  		found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
2830  		if (found_errors > rbio->bioc->max_errors) {
2831  			ret = -EIO;
2832  			break;
2833  		}
2834  	}
2835  out:
2836  	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2837  }
2838  
2839  static void scrub_rbio_work_locked(struct work_struct *work)
2840  {
2841  	scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
2842  }
2843  
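/*
 * Submit a scrub rbio.  If the full stripe is already locked, the rbio is
 * queued and will be started once the current lock owner releases the lock.
 */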
2844  void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2845  {
2846  	if (!lock_stripe_add(rbio))
2847  		start_async_work(rbio, scrub_rbio_work_locked);
2848  }
2849  
2850  /*
2851   * This is for scrub call sites where we already have correct data contents.
2852   * This allows us to avoid reading the data stripes again.
2853   *
2854   * Unfortunately here we have to copy the pages rather than reuse them;
2855   * this is because the rbio has its own page management for its cache.
2856   */
2857  void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
2858  				    struct page **data_pages, u64 data_logical)
2859  {
2860  	const u64 offset_in_full_stripe = data_logical -
2861  					  rbio->bioc->full_stripe_logical;
2862  	const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
2863  	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2864  	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
2865  	int ret;
2866  
2867  	/*
2868  	 * If we hit ENOMEM temporarily here, but the allocation later at
2869  	 * raid56_parity_submit_scrub_rbio() time succeeds, we just do the
2870  	 * extra read, not a big deal.
2871  	 *
2872  	 * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time,
2873  	 * the bio will get a proper error number set.
2874  	 */
2875  	ret = alloc_rbio_data_pages(rbio);
2876  	if (ret < 0)
2877  		return;
2878  
2879  	/* data_logical must be at stripe boundary and inside the full stripe. */
2880  	ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
2881  	ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
2882  
2883  	for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
2884  		struct page *dst = rbio->stripe_pages[page_nr + page_index];
2885  		struct page *src = data_pages[page_nr];
2886  
2887  		memcpy_page(dst, 0, src, 0, PAGE_SIZE);
2888  		for (int sector_nr = sectors_per_page * page_index;
2889  		     sector_nr < sectors_per_page * (page_index + 1);
2890  		     sector_nr++)
2891  			rbio->stripe_sectors[sector_nr].uptodate = true;
2892  	}
2893  }
2894