1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * io.c
4  *
5  * Buffer cache handling
6  *
7  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
8  */
9 
10 #include <linux/fs.h>
11 #include <linux/types.h>
12 #include <linux/highmem.h>
13 #include <linux/bio.h>
14 
15 #include <cluster/masklog.h>
16 
17 #include "ocfs2.h"
18 
19 #include "alloc.h"
20 #include "inode.h"
21 #include "journal.h"
22 #include "uptodate.h"
23 #include "buffer_head_io.h"
24 #include "ocfs2_trace.h"
25 
26 /*
27  * Bits on bh->b_state used by ocfs2.
28  *
29  * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
30  */
31 enum ocfs2_state_bits {
	/*
	 * Set by ocfs2_read_blocks() on a buffer it submits for read when the
	 * caller supplied a validate callback; cleared again by the same
	 * function once the callback has been run or the read has failed.
	 */
32 	BH_NeedsValidate = BH_JBDPrivateStart,
33 };
34 
35 /* Expand the magic b_state functions */
/*
 * The set_buffer_needs_validate(), clear_buffer_needs_validate() and
 * buffer_needs_validate() helpers used further down in this file are
 * provided by this expansion.
 */
36 BUFFER_FNS(NeedsValidate, needs_validate);
37 
ocfs2_write_block(struct ocfs2_super * osb,struct buffer_head * bh,struct ocfs2_caching_info * ci)38 int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
39 		      struct ocfs2_caching_info *ci)
40 {
41 	int ret = 0;
42 
43 	trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci);
44 
45 	BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
46 	BUG_ON(buffer_jbd(bh));
47 
48 	/* No need to check for a soft readonly file system here. non
49 	 * journalled writes are only ever done on system files which
50 	 * can get modified during recovery even if read-only. */
51 	if (ocfs2_is_hard_readonly(osb)) {
52 		ret = -EROFS;
53 		mlog_errno(ret);
54 		goto out;
55 	}
56 
57 	ocfs2_metadata_cache_io_lock(ci);
58 
59 	lock_buffer(bh);
60 	set_buffer_uptodate(bh);
61 
62 	/* remove from dirty list before I/O. */
63 	clear_buffer_dirty(bh);
64 
65 	get_bh(bh); /* for end_buffer_write_sync() */
66 	bh->b_end_io = end_buffer_write_sync;
67 	submit_bh(REQ_OP_WRITE, bh);
68 
69 	wait_on_buffer(bh);
70 
71 	if (buffer_uptodate(bh)) {
72 		ocfs2_set_buffer_uptodate(ci, bh);
73 	} else {
74 		/* We don't need to remove the clustered uptodate
75 		 * information for this bh as it's not marked locally
76 		 * uptodate. */
77 		ret = -EIO;
78 		mlog_errno(ret);
79 	}
80 
81 	ocfs2_metadata_cache_io_unlock(ci);
82 out:
83 	return ret;
84 }
85 
86 /* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
87  * will be easier to handle read failure.
88  */
ocfs2_read_blocks_sync(struct ocfs2_super * osb,u64 block,unsigned int nr,struct buffer_head * bhs[])89 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
90 			   unsigned int nr, struct buffer_head *bhs[])
91 {
92 	int status = 0;
93 	unsigned int i;
94 	struct buffer_head *bh;
95 	int new_bh = 0;
96 
97 	trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
98 
99 	if (!nr)
100 		goto bail;
101 
102 	/* Don't put buffer head and re-assign it to NULL if it is allocated
103 	 * outside since the caller can't be aware of this alternation!
104 	 */
105 	new_bh = (bhs[0] == NULL);
106 
107 	for (i = 0 ; i < nr ; i++) {
108 		if (bhs[i] == NULL) {
109 			bhs[i] = sb_getblk(osb->sb, block++);
110 			if (bhs[i] == NULL) {
111 				status = -ENOMEM;
112 				mlog_errno(status);
113 				break;
114 			}
115 		}
116 		bh = bhs[i];
117 
118 		if (buffer_jbd(bh)) {
119 			trace_ocfs2_read_blocks_sync_jbd(
120 					(unsigned long long)bh->b_blocknr);
121 			continue;
122 		}
123 
124 		if (buffer_dirty(bh)) {
125 			/* This should probably be a BUG, or
126 			 * at least return an error. */
127 			mlog(ML_ERROR,
128 			     "trying to sync read a dirty "
129 			     "buffer! (blocknr = %llu), skipping\n",
130 			     (unsigned long long)bh->b_blocknr);
131 			continue;
132 		}
133 
134 		lock_buffer(bh);
135 		if (buffer_jbd(bh)) {
136 #ifdef CATCH_BH_JBD_RACES
137 			mlog(ML_ERROR,
138 			     "block %llu had the JBD bit set "
139 			     "while I was in lock_buffer!",
140 			     (unsigned long long)bh->b_blocknr);
141 			BUG();
142 #else
143 			unlock_buffer(bh);
144 			continue;
145 #endif
146 		}
147 
148 		get_bh(bh); /* for end_buffer_read_sync() */
149 		bh->b_end_io = end_buffer_read_sync;
150 		submit_bh(REQ_OP_READ, bh);
151 	}
152 
153 read_failure:
154 	for (i = nr; i > 0; i--) {
155 		bh = bhs[i - 1];
156 
157 		if (unlikely(status)) {
158 			if (new_bh && bh) {
159 				/* If middle bh fails, let previous bh
160 				 * finish its read and then put it to
161 				 * avoid bh leak
162 				 */
163 				if (!buffer_jbd(bh))
164 					wait_on_buffer(bh);
165 				put_bh(bh);
166 				bhs[i - 1] = NULL;
167 			} else if (bh && buffer_uptodate(bh)) {
168 				clear_buffer_uptodate(bh);
169 			}
170 			continue;
171 		}
172 
173 		/* No need to wait on the buffer if it's managed by JBD. */
174 		if (!buffer_jbd(bh))
175 			wait_on_buffer(bh);
176 
177 		if (!buffer_uptodate(bh)) {
178 			/* Status won't be cleared from here on out,
179 			 * so we can safely record this and loop back
180 			 * to cleanup the other buffers. */
181 			status = -EIO;
182 			goto read_failure;
183 		}
184 	}
185 
186 bail:
187 	return status;
188 }
189 
190 /* Caller must provide a bhs[] with all NULL or non-NULL entries, so it
191  * will be easier to handle read failure.
192  */
/*
 * Read @nr metadata blocks into @bhs, going through the uptodate cache
 * that belongs to @ci.
 *
 * @ci:       caching info; must not be NULL (BUG otherwise).  Its io lock
 *            is held across both the submission and the completion pass.
 * @block:    block number handed to sb_getblk() for the first NULL slot
 *            of @bhs; advanced by one for every slot filled in here.
 * @nr:       number of entries in @bhs.  A negative value yields -EINVAL,
 *            zero returns 0 without doing anything.
 * @bhs:      buffer head array; NULL entries are filled via sb_getblk().
 *            A NULL array yields -EINVAL.
 * @flags:    OCFS2_BH_IGNORE_CACHE forces a read from disk even when the
 *            buffer is in the uptodate cache.  OCFS2_BH_READAHEAD submits
 *            the reads but skips waiting, the uptodate check and the
 *            validation in the completion pass.  Passing both is a BUG.
 * @validate: optional callback; when non-NULL, every buffer submitted for
 *            read here is tagged NeedsValidate, and the callback is run on
 *            tagged buffers in the (non-readahead) completion pass.
 *
 * Returns 0 on success, -EINVAL for bad arguments, -ENOMEM when
 * sb_getblk() fails, -EIO when a buffer is not uptodate after waiting, or
 * the non-zero value returned by @validate.  On error in a non-readahead
 * call, buffers obtained here (bhs[0] was NULL on entry) are put and
 * their slots reset to NULL; caller-supplied buffers only have their
 * uptodate bit cleared.
 *
 * NOTE(review): the completion pass calls validate() whenever the
 * NeedsValidate bit is set, without checking that this call's @validate
 * is non-NULL.  Presumably all callers touching a given block pass the
 * same callback - confirm.
 */
ocfs2_read_blocks(struct ocfs2_caching_info * ci,u64 block,int nr,struct buffer_head * bhs[],int flags,int (* validate)(struct super_block * sb,struct buffer_head * bh))193 int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
194 		      struct buffer_head *bhs[], int flags,
195 		      int (*validate)(struct super_block *sb,
196 				      struct buffer_head *bh))
197 {
198 	int status = 0;
199 	int i, ignore_cache = 0;
200 	struct buffer_head *bh;
201 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
202 	int new_bh = 0;
203 
204 	trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
205 
206 	BUG_ON(!ci);
207 	BUG_ON((flags & OCFS2_BH_READAHEAD) &&
208 	       (flags & OCFS2_BH_IGNORE_CACHE));
209 
210 	if (bhs == NULL) {
211 		status = -EINVAL;
212 		mlog_errno(status);
213 		goto bail;
214 	}
215 
216 	if (nr < 0) {
217 		mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
218 		status = -EINVAL;
219 		mlog_errno(status);
220 		goto bail;
221 	}
222 
223 	if (nr == 0) {
224 		status = 0;
225 		goto bail;
226 	}
227 
228 	/* Don't put buffer head and re-assign it to NULL if it is allocated
229 	 * outside since the caller can't be aware of this alternation!
230 	 */
231 	new_bh = (bhs[0] == NULL);
232 
	/* Submission pass: the io lock taken here is only released after
	 * the completion pass further down. */
233 	ocfs2_metadata_cache_io_lock(ci);
234 	for (i = 0 ; i < nr ; i++) {
235 		if (bhs[i] == NULL) {
236 			bhs[i] = sb_getblk(sb, block++);
237 			if (bhs[i] == NULL) {
238 				status = -ENOMEM;
239 				mlog_errno(status);
240 				/* Don't forget to put previous bh! */
241 				break;
242 			}
243 		}
244 		bh = bhs[i];
		/* Recomputed for every buffer: it may be forced to 1 below
		 * for a buffer that is not in the uptodate cache. */
245 		ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
246 
247 		/* There are three read-ahead cases here which we need to
248 		 * be concerned with. All three assume a buffer has
249 		 * previously been submitted with OCFS2_BH_READAHEAD
250 		 * and it hasn't yet completed I/O.
251 		 *
252 		 * 1) The current request is sync to disk. This rarely
253 		 *    happens these days, and never when performance
254 		 *    matters - the code can just wait on the buffer
255 		 *    lock and re-submit.
256 		 *
257 		 * 2) The current request is cached, but not
258 		 *    readahead. ocfs2_buffer_uptodate() will return
259 		 *    false anyway, so we'll wind up waiting on the
260 		 *    buffer lock to do I/O. We re-check the request
261 		 *    with after getting the lock to avoid a re-submit.
262 		 *
263 		 * 3) The current request is readahead (and so must
264 		 *    also be a caching one). We short circuit if the
265 		 *    buffer is locked (under I/O) and if it's in the
266 		 *    uptodate cache. The re-check from #2 catches the
267 		 *    case that the previous read-ahead completes just
268 		 *    before our is-it-in-flight check.
269 		 */
270 
271 		if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
272 			trace_ocfs2_read_blocks_from_disk(
273 			     (unsigned long long)bh->b_blocknr,
274 			     (unsigned long long)ocfs2_metadata_cache_owner(ci));
275 			/* We're using ignore_cache here to say
276 			 * "go to disk" */
277 			ignore_cache = 1;
278 		}
279 
280 		trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr,
281 			ignore_cache, buffer_jbd(bh), buffer_dirty(bh));
282 
		/* Buffers owned by the journal are never submitted here. */
283 		if (buffer_jbd(bh)) {
284 			continue;
285 		}
286 
287 		if (ignore_cache) {
288 			if (buffer_dirty(bh)) {
289 				/* This should probably be a BUG, or
290 				 * at least return an error. */
291 				continue;
292 			}
293 
294 			/* A read-ahead request was made - if the
295 			 * buffer is already under read-ahead from a
296 			 * previously submitted request than we are
297 			 * done here. */
298 			if ((flags & OCFS2_BH_READAHEAD)
299 			    && ocfs2_buffer_read_ahead(ci, bh))
300 				continue;
301 
302 			lock_buffer(bh);
			/* Re-check the JBD bit now that the buffer lock is
			 * held; it was last tested before lock_buffer(). */
303 			if (buffer_jbd(bh)) {
304 #ifdef CATCH_BH_JBD_RACES
305 				mlog(ML_ERROR, "block %llu had the JBD bit set "
306 					       "while I was in lock_buffer!",
307 				     (unsigned long long)bh->b_blocknr);
308 				BUG();
309 #else
310 				unlock_buffer(bh);
311 				continue;
312 #endif
313 			}
314 
315 			/* Re-check ocfs2_buffer_uptodate() as a
316 			 * previously read-ahead buffer may have
317 			 * completed I/O while we were waiting for the
318 			 * buffer lock. */
319 			if (!(flags & OCFS2_BH_IGNORE_CACHE)
320 			    && !(flags & OCFS2_BH_READAHEAD)
321 			    && ocfs2_buffer_uptodate(ci, bh)) {
322 				unlock_buffer(bh);
323 				continue;
324 			}
325 
326 			get_bh(bh); /* for end_buffer_read_sync() */
			/* Tag the buffer so the completion pass knows the
			 * callback still has to be run on it. */
327 			if (validate)
328 				set_buffer_needs_validate(bh);
329 			bh->b_end_io = end_buffer_read_sync;
330 			submit_bh(REQ_OP_READ, bh);
331 			continue;
332 		}
333 	}
334 
	/* Completion pass: walks bhs[] from the last slot to the first.
	 * Jumping back to this label after status has been set re-walks
	 * every slot through the error branch. */
335 read_failure:
336 	for (i = (nr - 1); i >= 0; i--) {
337 		bh = bhs[i];
338 
		/* Read-ahead requests skip waiting, the uptodate check and
		 * validation; they only record the buffer in the cache. */
339 		if (!(flags & OCFS2_BH_READAHEAD)) {
340 			if (unlikely(status)) {
341 				/* Clear the buffers on error including those
342 				 * ever succeeded in reading
343 				 */
344 				if (new_bh && bh) {
345 					/* If middle bh fails, let previous bh
346 					 * finish its read and then put it to
347 					 * avoid bh leak
348 					 */
349 					if (!buffer_jbd(bh))
350 						wait_on_buffer(bh);
351 					put_bh(bh);
352 					bhs[i] = NULL;
353 				} else if (bh && buffer_uptodate(bh)) {
354 					clear_buffer_uptodate(bh);
355 				}
356 				continue;
357 			}
358 			/* We know this can't have changed as we hold the
359 			 * owner sem. Avoid doing any work on the bh if the
360 			 * journal has it. */
361 			if (!buffer_jbd(bh))
362 				wait_on_buffer(bh);
363 
364 			if (!buffer_uptodate(bh)) {
365 				/* Status won't be cleared from here on out,
366 				 * so we can safely record this and loop back
367 				 * to cleanup the other buffers. Don't need to
368 				 * remove the clustered uptodate information
369 				 * for this bh as it's not marked locally
370 				 * uptodate. */
371 				status = -EIO;
372 				clear_buffer_needs_validate(bh);
373 				goto read_failure;
374 			}
375 
376 			if (buffer_needs_validate(bh)) {
377 				/* We never set NeedsValidate if the
378 				 * buffer was held by the journal, so
379 				 * that better not have changed */
380 				BUG_ON(buffer_jbd(bh));
381 				clear_buffer_needs_validate(bh);
382 				status = validate(sb, bh);
				/* A failed validation is handled like a
				 * failed read: restart the pass so every
				 * buffer is cleaned up. */
383 				if (status)
384 					goto read_failure;
385 			}
386 		}
387 
388 		/* Always set the buffer in the cache, even if it was
389 		 * a forced read, or read-ahead which hasn't yet
390 		 * completed. */
391 		if (bh)
392 			ocfs2_set_buffer_uptodate(ci, bh);
393 	}
394 	ocfs2_metadata_cache_io_unlock(ci);
395 
	/* block has been advanced once for every buffer obtained through
	 * sb_getblk() above, and ignore_cache holds the value computed for
	 * the last buffer the submission pass looked at. */
396 	trace_ocfs2_read_blocks_end((unsigned long long)block, nr,
397 				    flags, ignore_cache);
398 
399 bail:
400 
401 	return status;
402 }
403 
404 /* Check whether the blkno is the super block or one of the backups. */
ocfs2_check_super_or_backup(struct super_block * sb,sector_t blkno)405 static void ocfs2_check_super_or_backup(struct super_block *sb,
406 					sector_t blkno)
407 {
408 	int i;
409 	u64 backup_blkno;
410 
411 	if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
412 		return;
413 
414 	for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
415 		backup_blkno = ocfs2_backup_super_blkno(sb, i);
416 		if (backup_blkno == blkno)
417 			return;
418 	}
419 
420 	BUG();
421 }
422 
423 /*
424  * Write super block and backups doesn't need to collaborate with journal,
425  * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed
426  * into this function.
427  */
ocfs2_write_super_or_backup(struct ocfs2_super * osb,struct buffer_head * bh)428 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
429 				struct buffer_head *bh)
430 {
431 	int ret = 0;
432 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
433 
434 	BUG_ON(buffer_jbd(bh));
435 	ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
436 
437 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
438 		ret = -EROFS;
439 		mlog_errno(ret);
440 		goto out;
441 	}
442 
443 	lock_buffer(bh);
444 	set_buffer_uptodate(bh);
445 
446 	/* remove from dirty list before I/O. */
447 	clear_buffer_dirty(bh);
448 
449 	get_bh(bh); /* for end_buffer_write_sync() */
450 	bh->b_end_io = end_buffer_write_sync;
451 	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
452 	submit_bh(REQ_OP_WRITE, bh);
453 
454 	wait_on_buffer(bh);
455 
456 	if (!buffer_uptodate(bh)) {
457 		ret = -EIO;
458 		mlog_errno(ret);
459 	}
460 
461 out:
462 	return ret;
463 }
464