// SPDX-License-Identifier: GPL-2.0
/* Maximum size of each resync request */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)

/*
 * Number of guaranteed raid bios in case of extreme VM load:
 */
#define	NR_RAID_BIOS 256

/* when we get a read error on a read-only array, we redirect to another
 * device without failing the first device, or trying to over-write to
 * correct the read error.  To keep track of bad blocks on a per-bio
 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
 */
#define IO_BLOCKED ((struct bio *)1)
/* When we successfully write to a known bad-block, we need to remove the
 * bad-block marking which must be done from process context.  So we record
 * the success by setting devs[n].bio to IO_MADE_GOOD
 */
#define IO_MADE_GOOD ((struct bio *)2)

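/*
 * A 'bios' slot whose value is at most 2 (NULL, IO_BLOCKED or IO_MADE_GOOD)
 * does not point to a real bio.  MAX_PLUG_BIO is the number of plugged bios
 * allowed per copy before raid1_add_bio_to_plug() flushes the plug early.
 */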
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
#define MAX_PLUG_BIO 32

/* for managing resync I/O pages */
struct resync_pages {
	void		*raid_bio;
	struct page	*pages[RESYNC_PAGES];
};

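/*
 * Per-thread write plugging state: pending write bios are collected here and
 * flushed when the plug is released, or earlier once enough bios have been
 * batched (see raid1_add_bio_to_plug()).
 */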
struct raid1_plug_cb {
	struct blk_plug_cb	cb;
	struct bio_list		pending;
	unsigned int		count;
};

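/* mempool free callback for the raid bio structures, which are kfree()d */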
static void rbio_pool_free(void *rbio, void *data)
{
	kfree(rbio);
}

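/*
 * Allocate the RESYNC_PAGES pages backing one resync bio.  On failure all
 * pages allocated so far are released and -ENOMEM is returned.
 */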
static inline int resync_alloc_pages(struct resync_pages *rp,
				     gfp_t gfp_flags)
{
	int i;

	for (i = 0; i < RESYNC_PAGES; i++) {
		rp->pages[i] = alloc_page(gfp_flags);
		if (!rp->pages[i])
			goto out_free;
	}

	return 0;

out_free:
	while (--i >= 0)
		put_page(rp->pages[i]);
	return -ENOMEM;
}

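/* Drop one reference on each page of a resync_pages set */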
static inline void resync_free_pages(struct resync_pages *rp)
{
	int i;

	for (i = 0; i < RESYNC_PAGES; i++)
		put_page(rp->pages[i]);
}

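/*
 * Take an extra reference on each page, e.g. so the same pages can be
 * attached to more than one bio.
 */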
static inline void resync_get_all_pages(struct resync_pages *rp)
{
	int i;

	for (i = 0; i < RESYNC_PAGES; i++)
		get_page(rp->pages[i]);
}

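/* Return the idx'th page of a resync_pages set, or NULL if idx is out of range */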
static inline struct page *resync_fetch_page(struct resync_pages *rp,
					     unsigned idx)
{
	if (WARN_ON_ONCE(idx >= RESYNC_PAGES))
		return NULL;
	return rp->pages[idx];
}

/*
 * 'struct resync_pages' stores the actual pages used for doing the resync
 * IO, and it is per-bio, so make .bi_private point to it.
 */
static inline struct resync_pages *get_resync_pages(struct bio *bio)
{
	return bio->bi_private;
}

/* generally called after bio_reset() for resetting bvec */
static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
			       int size)
{
	int idx = 0;

	/* initialize bvec table again */
	do {
		struct page *page = resync_fetch_page(rp, idx);
		int len = min_t(int, size, PAGE_SIZE);

		if (WARN_ON(!bio_add_page(bio, page, len, 0))) {
			bio->bi_status = BLK_STS_RESOURCE;
			bio_endio(bio);
			return;
		}

		size -= len;
	} while (idx++ < RESYNC_PAGES && size > 0);
}

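/*
 * Issue a queued write to the rdev stashed in bio->bi_bdev.  Writes to a
 * Faulty device are failed immediately, and discards to a device without
 * discard support are completed without error.
 */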
static inline void raid1_submit_write(struct bio *bio)
{
	struct md_rdev *rdev = (void *)bio->bi_bdev;

	bio->bi_next = NULL;
	bio_set_dev(bio, rdev->bdev);
	if (test_bit(Faulty, &rdev->flags))
		bio_io_error(bio);
	else if (unlikely(bio_op(bio) == REQ_OP_DISCARD &&
			  !bdev_max_discard_sectors(bio->bi_bdev)))
		/* Just ignore it */
		bio_endio(bio);
	else
		submit_bio_noacct(bio);
}

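/*
 * Try to batch a write bio on the current blk_plug.  Without a bitmap the
 * bio is submitted directly; otherwise it is queued on the plug callback,
 * which is flushed early once enough bios (MAX_PLUG_BIO per copy) have
 * accumulated.  Returns false if the bio could not be plugged, in which
 * case the caller must submit it by other means.
 */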
static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
				      blk_plug_cb_fn unplug, int copies)
{
	struct raid1_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;

	/*
	 * If no bitmap is enabled, it is safe to submit the IO directly,
	 * which gives optimal performance.
	 */
	if (!mddev->bitmap_ops->enabled(mddev)) {
		raid1_submit_write(bio);
		return true;
	}

	cb = blk_check_plugged(unplug, mddev, sizeof(*plug));
	if (!cb)
		return false;

	plug = container_of(cb, struct raid1_plug_cb, cb);
	bio_list_add(&plug->pending, bio);
	if (++plug->count / MAX_PLUG_BIO >= copies) {
		list_del(&cb->list);
		cb->callback(cb, false);
	}

	return true;
}

/*
 * current->bio_list is set under submit_bio() context; in that case bitmap IO
 * will be added to the list and will wait for the current IO submission to
 * finish, while the current IO submission must wait for the bitmap IO to be
 * done.  To avoid such a deadlock, submit bitmap IO asynchronously.
 */
static inline void raid1_prepare_flush_writes(struct mddev *mddev)
{
	mddev->bitmap_ops->unplug(mddev, current->bio_list == NULL);
}

/*
 * Used by fix_read_error() to decay the per rdev read_errors.
 * We halve the read error count for every hour that has elapsed
 * since the last recorded read error.
 */
static inline void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
{
	long cur_time_mon;
	unsigned long hours_since_last;
	unsigned int read_errors = atomic_read(&rdev->read_errors);

	cur_time_mon = ktime_get_seconds();

	if (rdev->last_read_error == 0) {
		/* first time we've seen a read error */
		rdev->last_read_error = cur_time_mon;
		return;
	}

	hours_since_last = (long)(cur_time_mon -
			    rdev->last_read_error) / 3600;

	rdev->last_read_error = cur_time_mon;

	/*
	 * if hours_since_last is > the number of bits in read_errors
	 * just set read errors to 0. We do this to avoid
	 * overflowing the shift of read_errors by hours_since_last.
	 */
	if (hours_since_last >= 8 * sizeof(read_errors))
		atomic_set(&rdev->read_errors, 0);
	else
		atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
}

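/*
 * Account one more read error on @rdev and fail the device if the (decayed)
 * error count exceeds mddev->max_corr_read_errors.  Returns true if the
 * device was failed.
 */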
static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev)
{
	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
	int read_errors;

	check_decay_read_errors(mddev, rdev);
	read_errors = atomic_inc_return(&rdev->read_errors);
	if (read_errors > max_read_errors) {
		pr_notice("md/"RAID_1_10_NAME":%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n",
			  mdname(mddev), rdev->bdev, read_errors, max_read_errors);
		pr_notice("md/"RAID_1_10_NAME":%s: %pg: Failing raid device\n",
			  mdname(mddev), rdev->bdev);
		md_error(mddev, rdev);
		return true;
	}

	return false;
}

/**
 * raid1_check_read_range() - check a given read range for bad blocks;
 * the available read length is returned
 * @rdev: the rdev to read;
 * @this_sector: read position;
 * @len: read length;
 *
 * helper function for read_balance()
 *
 * 1) If there are no bad blocks in the range, @len is returned;
 * 2) If the range is all bad blocks, 0 is returned;
 * 3) If there are partial bad blocks:
 *  - If the bad block range starts after @this_sector, the length of the
 *  first good region is returned;
 *  - If the bad block range starts before @this_sector, 0 is returned and
 *  @len is updated to the offset into the region before we get to the
 *  good blocks;
 */
static inline int raid1_check_read_range(struct md_rdev *rdev,
					 sector_t this_sector, int *len)
{
	sector_t first_bad;
	int bad_sectors;

	/* no bad block overlap */
	if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors))
		return *len;

	/*
	 * bad block range starts offset into our range so we can return the
	 * number of sectors before the bad blocks start.
	 */
	if (first_bad > this_sector)
		return first_bad - this_sector;

	/* read range is fully consumed by bad blocks. */
	if (this_sector + *len <= first_bad + bad_sectors)
		return 0;

	/*
	 * final case, bad block range starts before or at the start of our
	 * range but does not cover our entire range so we still return 0 but
	 * update the length with the number of sectors before we get to the
	 * good ones.
	 */
	*len = first_bad + bad_sectors - this_sector;
	return 0;
}

/*
 * Check if the read should choose the first rdev.
 *
 * Balance across the whole device if no resync is in progress (recovery is
 * OK) or if we are below the resync window. Otherwise, take the first
 * readable disk.
 */
static inline bool raid1_should_read_first(struct mddev *mddev,
					   sector_t this_sector, int len)
{
	if (mddev->recovery_cp < this_sector + len)
		return true;

	if (mddev_is_clustered(mddev) &&
	    md_cluster_ops->area_resyncing(mddev, READ, this_sector,
					   this_sector + len))
		return true;

	return false;
}