// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/transaction.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Generic filesystem transaction handling code; part of the ext2fs
 * journaling system.
 *
 * This file manages transactions (compound commits managed by the
 * journaling code) and handles (individual atomic operations by the
 * filesystem).
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/backing-dev.h>
#include <linux/bug.h>
#include <linux/module.h>
#include <linux/sched/mm.h>

#include <trace/events/jbd2.h>

static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
static void __jbd2_journal_unfile_buffer(struct journal_head *jh);

static struct kmem_cache *transaction_cache;
int __init jbd2_journal_init_transaction_cache(void)
{
	J_ASSERT(!transaction_cache);
	transaction_cache = kmem_cache_create("jbd2_transaction_s",
					sizeof(transaction_t),
					0,
					SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
					NULL);
	if (!transaction_cache) {
		pr_emerg("JBD2: failed to create transaction cache\n");
		return -ENOMEM;
	}
	return 0;
}

void jbd2_journal_destroy_transaction_cache(void)
{
	kmem_cache_destroy(transaction_cache);
	transaction_cache = NULL;
}

void jbd2_journal_free_transaction(transaction_t *transaction)
{
	if (unlikely(ZERO_OR_NULL_PTR(transaction)))
		return;
	kmem_cache_free(transaction_cache, transaction);
}

/*
 * jbd2_get_transaction: obtain a new transaction_t object.
 *
 * Simply initialise a new transaction: set it up in RUNNING state and
 * add it to the current journal (which should not have an existing
 * running transaction: we only make a new transaction once we have
 * started to commit the old one).
 *
 * Preconditions:
 *	The journal MUST be locked.  We don't perform atomic mallocs on the
 *	new transaction and we can't block without protecting against other
 *	processes trying to touch the journal while it is in transition.
 *
 */

static void jbd2_get_transaction(journal_t *journal,
				transaction_t *transaction)
{
	transaction->t_journal = journal;
	transaction->t_state = T_RUNNING;
	transaction->t_start_time = ktime_get();
	transaction->t_tid = journal->j_transaction_sequence++;
	transaction->t_expires = jiffies + journal->j_commit_interval;
	atomic_set(&transaction->t_updates, 0);
	atomic_set(&transaction->t_outstanding_credits,
		   journal->j_transaction_overhead_buffers +
		   atomic_read(&journal->j_reserved_credits));
	atomic_set(&transaction->t_outstanding_revokes, 0);
	atomic_set(&transaction->t_handle_count, 0);
	INIT_LIST_HEAD(&transaction->t_inode_list);
	INIT_LIST_HEAD(&transaction->t_private_list);

	/* Set up the commit timer for the new transaction. */
	journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
	add_timer(&journal->j_commit_timer);

	J_ASSERT(journal->j_running_transaction == NULL);
	journal->j_running_transaction = transaction;
	transaction->t_max_wait = 0;
	transaction->t_start = jiffies;
	transaction->t_requested = 0;
}

/*
 * Handle management.
 *
 * A handle_t is an object which represents a single atomic update to a
 * filesystem, and which tracks all of the modifications which form part
 * of that one update.
 */
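
/*
 * Illustrative sketch (not part of this file): the typical handle
 * lifecycle as seen by a filesystem.  All identifiers except the jbd2
 * calls are hypothetical placeholders.
 *
 *	handle_t *handle = jbd2_journal_start(journal, credits);
 *
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	err = jbd2_journal_get_write_access(handle, bh);
 *	if (!err) {
 *		(modify bh->b_data here)
 *		err = jbd2_journal_dirty_metadata(handle, bh);
 *	}
 *	jbd2_journal_stop(handle);
 */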

/*
 * Update transaction's maximum wait time, if debugging is enabled.
 *
 * t_max_wait is carefully updated here with the use of atomic compare exchange.
 * Note that there could be multiple threads trying to do this simultaneously,
 * hence we use cmpxchg to avoid taking any locks in this case.
 * With this, t_max_wait can be updated w/o enabling jbd2_journal_enable_debug.
 */
static inline void update_t_max_wait(transaction_t *transaction,
				     unsigned long ts)
{
	unsigned long oldts, newts;

	if (time_after(transaction->t_start, ts)) {
		newts = jbd2_time_diff(ts, transaction->t_start);
		oldts = READ_ONCE(transaction->t_max_wait);
		while (oldts < newts)
			oldts = cmpxchg(&transaction->t_max_wait, oldts, newts);
	}
}

/*
 * Wait until running transaction passes to T_FLUSH state and new transaction
 * can thus be started. Also starts the commit if needed. The function expects
 * running transaction to exist and releases j_state_lock.
 */
static void wait_transaction_locked(journal_t *journal)
	__releases(journal->j_state_lock)
{
	DEFINE_WAIT(wait);
	int need_to_start;
	tid_t tid = journal->j_running_transaction->t_tid;

	prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
			TASK_UNINTERRUPTIBLE);
	need_to_start = !tid_geq(journal->j_commit_request, tid);
	read_unlock(&journal->j_state_lock);
	if (need_to_start)
		jbd2_log_start_commit(journal, tid);
	jbd2_might_wait_for_commit(journal);
	schedule();
	finish_wait(&journal->j_wait_transaction_locked, &wait);
}

/*
 * Wait until running transaction transitions from T_SWITCH to T_FLUSH
 * state and new transaction can thus be started. The function releases
 * j_state_lock.
 */
static void wait_transaction_switching(journal_t *journal)
	__releases(journal->j_state_lock)
{
	DEFINE_WAIT(wait);

	if (WARN_ON(!journal->j_running_transaction ||
		    journal->j_running_transaction->t_state != T_SWITCH)) {
		read_unlock(&journal->j_state_lock);
		return;
	}
	prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
			TASK_UNINTERRUPTIBLE);
	read_unlock(&journal->j_state_lock);
	/*
	 * We don't call jbd2_might_wait_for_commit() here as there's no
	 * waiting for outstanding handles happening anymore in T_SWITCH state
	 * and handling of reserved handles actually relies on that for
	 * correctness.
	 */
	schedule();
	finish_wait(&journal->j_wait_transaction_locked, &wait);
}

static void sub_reserved_credits(journal_t *journal, int blocks)
{
	atomic_sub(blocks, &journal->j_reserved_credits);
	wake_up(&journal->j_wait_reserved);
}

/* Maximum number of blocks for user transaction payload */
static int jbd2_max_user_trans_buffers(journal_t *journal)
{
	return journal->j_max_transaction_buffers -
				journal->j_transaction_overhead_buffers;
}

/*
 * Wait until we can add credits for handle to the running transaction.  Called
 * with j_state_lock held for reading. Returns 0 if handle joined the running
 * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
 * caller must retry.
 *
 * Note: because j_state_lock may be dropped depending on the return
 * value, we need to fake out sparse so it doesn't complain about a
 * locking imbalance.  Callers of add_transaction_credits will need to
 * make a similar accommodation.
 */
static int add_transaction_credits(journal_t *journal, int blocks,
				   int rsv_blocks)
__must_hold(&journal->j_state_lock)
{
	transaction_t *t = journal->j_running_transaction;
	int needed;
	int total = blocks + rsv_blocks;

	/*
	 * If the current transaction is locked down for commit, wait
	 * for the lock to be released.
	 */
	if (t->t_state != T_RUNNING) {
		WARN_ON_ONCE(t->t_state >= T_FLUSH);
		wait_transaction_locked(journal);
		__acquire(&journal->j_state_lock); /* fake out sparse */
		return 1;
	}

	/*
	 * If there is not enough space left in the log to write all
	 * potential buffers requested by this operation, we need to
	 * stall pending a log checkpoint to free some more log space.
	 */
	needed = atomic_add_return(total, &t->t_outstanding_credits);
	if (needed > journal->j_max_transaction_buffers) {
		/*
		 * If the current transaction is already too large,
		 * then start to commit it: we can then go back and
		 * attach this handle to a new transaction.
		 */
		atomic_sub(total, &t->t_outstanding_credits);

		/*
		 * Is the number of reserved credits in the current transaction too
		 * big to fit this handle? Wait until reserved credits are freed.
		 */
		if (atomic_read(&journal->j_reserved_credits) + total >
		    jbd2_max_user_trans_buffers(journal)) {
			read_unlock(&journal->j_state_lock);
			jbd2_might_wait_for_commit(journal);
			wait_event(journal->j_wait_reserved,
				   atomic_read(&journal->j_reserved_credits) + total <=
				   jbd2_max_user_trans_buffers(journal));
			__acquire(&journal->j_state_lock); /* fake out sparse */
			return 1;
		}

		wait_transaction_locked(journal);
		__acquire(&journal->j_state_lock); /* fake out sparse */
		return 1;
	}

	/*
	 * The commit code assumes that it can get enough log space
	 * without forcing a checkpoint.  This is *critical* for
	 * correctness: a checkpoint of a buffer which is also
	 * associated with a committing transaction creates a deadlock,
	 * so commit simply cannot force through checkpoints.
	 *
	 * We must therefore ensure the necessary space in the journal
	 * *before* starting to dirty potentially checkpointed buffers
	 * in the new transaction.
	 */
	if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
		atomic_sub(total, &t->t_outstanding_credits);
		read_unlock(&journal->j_state_lock);
		jbd2_might_wait_for_commit(journal);
		write_lock(&journal->j_state_lock);
		if (jbd2_log_space_left(journal) <
					journal->j_max_transaction_buffers)
			__jbd2_log_wait_for_space(journal);
		write_unlock(&journal->j_state_lock);
		__acquire(&journal->j_state_lock); /* fake out sparse */
		return 1;
	}

	/* No reservation? We are done... */
	if (!rsv_blocks)
		return 0;

	needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
	/* We allow at most half of a transaction to be reserved */
	if (needed > jbd2_max_user_trans_buffers(journal) / 2) {
		sub_reserved_credits(journal, rsv_blocks);
		atomic_sub(total, &t->t_outstanding_credits);
		read_unlock(&journal->j_state_lock);
		jbd2_might_wait_for_commit(journal);
		wait_event(journal->j_wait_reserved,
			 atomic_read(&journal->j_reserved_credits) + rsv_blocks
			 <= jbd2_max_user_trans_buffers(journal) / 2);
		__acquire(&journal->j_state_lock); /* fake out sparse */
		return 1;
	}
	return 0;
}

/*
 * start_this_handle: Given a handle, deal with any locking or stalling
 * needed to make sure that there is enough journal space for the handle
 * to begin.  Attach the handle to a transaction and set up the
 * transaction's buffer credits.
 */

static int start_this_handle(journal_t *journal, handle_t *handle,
			     gfp_t gfp_mask)
{
	transaction_t	*transaction, *new_transaction = NULL;
	int		blocks = handle->h_total_credits;
	int		rsv_blocks = 0;
	unsigned long ts = jiffies;

	if (handle->h_rsv_handle)
		rsv_blocks = handle->h_rsv_handle->h_total_credits;

	/*
	 * Limit the number of reserved credits to 1/2 of maximum transaction
	 * size and limit the number of total credits to not exceed maximum
	 * transaction size per operation.
	 */
	if (rsv_blocks > jbd2_max_user_trans_buffers(journal) / 2 ||
	    rsv_blocks + blocks > jbd2_max_user_trans_buffers(journal)) {
		printk(KERN_ERR "JBD2: %s wants too many credits "
		       "credits:%d rsv_credits:%d max:%d\n",
		       current->comm, blocks, rsv_blocks,
		       jbd2_max_user_trans_buffers(journal));
		WARN_ON(1);
		return -ENOSPC;
	}

alloc_transaction:
	/*
	 * This check is racy but it is just an optimization of allocating a
	 * new transaction early if there is a high chance we'll need it. If
	 * we guess wrong, we'll retry or free the unused transaction.
	 */
	if (!data_race(journal->j_running_transaction)) {
		/*
		 * If __GFP_FS is not present, then we may be being called from
		 * inside the fs writeback layer, so we MUST NOT fail.
		 */
		if ((gfp_mask & __GFP_FS) == 0)
			gfp_mask |= __GFP_NOFAIL;
		new_transaction = kmem_cache_zalloc(transaction_cache,
						    gfp_mask);
		if (!new_transaction)
			return -ENOMEM;
	}

	jbd2_debug(3, "New handle %p going live.\n", handle);

	/*
	 * We need to hold j_state_lock until t_updates has been incremented,
	 * for proper journal barrier handling
	 */
repeat:
	read_lock(&journal->j_state_lock);
	BUG_ON(journal->j_flags & JBD2_UNMOUNT);
	if (is_journal_aborted(journal) ||
	    (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
		read_unlock(&journal->j_state_lock);
		jbd2_journal_free_transaction(new_transaction);
		return -EROFS;
	}

	/*
	 * Wait on the journal's transaction barrier if necessary. Specifically
	 * we allow reserved handles to proceed because otherwise commit could
	 * deadlock on page writeback not being able to complete.
	 */
	if (!handle->h_reserved && journal->j_barrier_count) {
		read_unlock(&journal->j_state_lock);
		wait_event(journal->j_wait_transaction_locked,
				journal->j_barrier_count == 0);
		goto repeat;
	}

	if (!journal->j_running_transaction) {
		read_unlock(&journal->j_state_lock);
		if (!new_transaction)
			goto alloc_transaction;
		write_lock(&journal->j_state_lock);
		if (!journal->j_running_transaction &&
		    (handle->h_reserved || !journal->j_barrier_count)) {
			jbd2_get_transaction(journal, new_transaction);
			new_transaction = NULL;
		}
		write_unlock(&journal->j_state_lock);
		goto repeat;
	}

	transaction = journal->j_running_transaction;

	if (!handle->h_reserved) {
		/* We may have dropped j_state_lock - restart in that case */
		if (add_transaction_credits(journal, blocks, rsv_blocks)) {
			/*
			 * add_transaction_credits releases
			 * j_state_lock on a non-zero return
			 */
			__release(&journal->j_state_lock);
			goto repeat;
		}
	} else {
		/*
		 * We have handle reserved so we are allowed to join T_LOCKED
		 * transaction and we don't have to check for transaction size
		 * and journal space. But we still have to wait while running
		 * transaction is being switched to a committing one as it
		 * won't wait for any handles anymore.
		 */
		if (transaction->t_state == T_SWITCH) {
			wait_transaction_switching(journal);
			goto repeat;
		}
		sub_reserved_credits(journal, blocks);
		handle->h_reserved = 0;
	}

	/* OK, account for the buffers that this operation expects to
	 * use and add the handle to the running transaction.
	 */
	update_t_max_wait(transaction, ts);
	handle->h_transaction = transaction;
	handle->h_requested_credits = blocks;
	handle->h_revoke_credits_requested = handle->h_revoke_credits;
	handle->h_start_jiffies = jiffies;
	atomic_inc(&transaction->t_updates);
	atomic_inc(&transaction->t_handle_count);
	jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
		  handle, blocks,
		  atomic_read(&transaction->t_outstanding_credits),
		  jbd2_log_space_left(journal));
	read_unlock(&journal->j_state_lock);
	current->journal_info = handle;

	rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
	jbd2_journal_free_transaction(new_transaction);
	/*
	 * Ensure that no allocations done while the transaction is open are
	 * going to recurse back to the fs layer.
	 */
	handle->saved_alloc_context = memalloc_nofs_save();
	return 0;
}

/* Allocate a new handle.  This should probably be in a slab... */
static handle_t *new_handle(int nblocks)
{
	handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
	if (!handle)
		return NULL;
	handle->h_total_credits = nblocks;
	handle->h_ref = 1;

	return handle;
}

handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
			      int revoke_records, gfp_t gfp_mask,
			      unsigned int type, unsigned int line_no)
{
	handle_t *handle = journal_current_handle();
	int err;

	if (!journal)
		return ERR_PTR(-EROFS);

	if (handle) {
		J_ASSERT(handle->h_transaction->t_journal == journal);
		handle->h_ref++;
		return handle;
	}

	nblocks += DIV_ROUND_UP(revoke_records,
				journal->j_revoke_records_per_block);
	handle = new_handle(nblocks);
	if (!handle)
		return ERR_PTR(-ENOMEM);
	if (rsv_blocks) {
		handle_t *rsv_handle;

		rsv_handle = new_handle(rsv_blocks);
		if (!rsv_handle) {
			jbd2_free_handle(handle);
			return ERR_PTR(-ENOMEM);
		}
		rsv_handle->h_reserved = 1;
		rsv_handle->h_journal = journal;
		handle->h_rsv_handle = rsv_handle;
	}
	handle->h_revoke_credits = revoke_records;

	err = start_this_handle(journal, handle, gfp_mask);
	if (err < 0) {
		if (handle->h_rsv_handle)
			jbd2_free_handle(handle->h_rsv_handle);
		jbd2_free_handle(handle);
		return ERR_PTR(err);
	}
	handle->h_type = type;
	handle->h_line_no = line_no;
	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
				handle->h_transaction->t_tid, type,
				line_no, nblocks);

	return handle;
}
EXPORT_SYMBOL(jbd2__journal_start);


/**
 * jbd2_journal_start() - Obtain a new handle.
 * @journal: Journal to start transaction on.
 * @nblocks: number of block buffers we might modify
 *
 * We make sure that the transaction can guarantee at least nblocks of
 * modified buffers in the log.  We block until the log can guarantee
 * that much space. Additionally, if rsv_blocks > 0, we also create another
 * handle with rsv_blocks reserved blocks in the journal. This handle is
 * stored in h_rsv_handle. It is not attached to any particular transaction
 * and thus doesn't block transaction commit. If the caller uses this reserved
 * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
 * on the parent handle will dispose of the reserved one. A reserved handle
 * has to be converted to a normal handle using jbd2_journal_start_reserved()
 * before it can be used.
 *
 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
 * on failure.
 */
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
	return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);

static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t)
{
	journal_t *journal = handle->h_journal;

	WARN_ON(!handle->h_reserved);
	sub_reserved_credits(journal, handle->h_total_credits);
	if (t)
		atomic_sub(handle->h_total_credits, &t->t_outstanding_credits);
}

void jbd2_journal_free_reserved(handle_t *handle)
{
	journal_t *journal = handle->h_journal;

	/* Get j_state_lock to pin running transaction if it exists */
	read_lock(&journal->j_state_lock);
	__jbd2_journal_unreserve_handle(handle, journal->j_running_transaction);
	read_unlock(&journal->j_state_lock);
	jbd2_free_handle(handle);
}
EXPORT_SYMBOL(jbd2_journal_free_reserved);

/**
 * jbd2_journal_start_reserved() - start reserved handle
 * @handle: handle to start
 * @type: for handle statistics
 * @line_no: for handle statistics
 *
 * Start a handle that has been previously reserved with jbd2_journal_reserve().
 * This attaches @handle to the running transaction (or creates one if there's
 * no transaction running). Unlike jbd2_journal_start() this function cannot
 * block on journal commit, checkpointing, or similar operations. It can block
 * on memory allocation or a frozen journal though.
 *
 * Return 0 on success, non-zero on error - handle is freed in that case.
 */
int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
				unsigned int line_no)
{
	journal_t *journal = handle->h_journal;
	int ret = -EIO;

	if (WARN_ON(!handle->h_reserved)) {
		/* Someone passed in a normal handle? Just stop it. */
		jbd2_journal_stop(handle);
		return ret;
	}
	/*
	 * Usefulness of mixing of reserved and unreserved handles is
	 * questionable. So far nobody seems to need it so just error out.
	 */
	if (WARN_ON(current->journal_info)) {
		jbd2_journal_free_reserved(handle);
		return ret;
	}

	handle->h_journal = NULL;
	/*
	 * GFP_NOFS is here because callers are likely from writeback or
	 * similarly constrained call sites
	 */
	ret = start_this_handle(journal, handle, GFP_NOFS);
	if (ret < 0) {
		handle->h_journal = journal;
		jbd2_journal_free_reserved(handle);
		return ret;
	}
	handle->h_type = type;
	handle->h_line_no = line_no;
	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
				handle->h_transaction->t_tid, type,
				line_no, handle->h_total_credits);
	return 0;
}
EXPORT_SYMBOL(jbd2_journal_start_reserved);
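
/*
 * Illustrative sketch (not part of this file): reserving credits up front
 * and converting the reserved handle later, matching the rules in the
 * jbd2_journal_start() and jbd2_journal_start_reserved() comments above.
 * "journal", "credits" and "rsv_credits" are hypothetical placeholders.
 *
 *	handle_t *handle = jbd2__journal_start(journal, credits, rsv_credits,
 *					       0, GFP_NOFS, 0, 0);
 *	handle_t *rsv = handle->h_rsv_handle;
 *
 *	(take ownership of the reserved handle before stopping the parent,
 *	 otherwise jbd2_journal_stop() disposes of it)
 *	handle->h_rsv_handle = NULL;
 *	jbd2_journal_stop(handle);
 *
 *	(later, convert the reservation into a normal handle; on error the
 *	 handle has already been freed)
 *	err = jbd2_journal_start_reserved(rsv, 0, 0);
 *	if (!err)
 *		jbd2_journal_stop(rsv);
 */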

/**
 * jbd2_journal_extend() - extend buffer credits.
 * @handle:  handle to 'extend'
 * @nblocks: nr blocks to try to extend by.
 * @revoke_records: number of revoke records to try to extend by.
 *
 * Some transactions, such as large extends and truncates, can be done
 * atomically all at once or in several stages.  The operation requests
 * a credit for a number of buffer modifications in advance, but can
 * extend its credit if it needs more.
 *
 * jbd2_journal_extend tries to give the running handle more buffer credits.
 * It does not guarantee the allocation; this is best-effort only.
 * The calling process MUST be able to deal cleanly with a failure to
 * extend here.
 *
 * Return 0 on success, non-zero on failure.
 *
 * A return code < 0 implies an error;
 * a return code > 0 implies normal transaction-full status.
 */
int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
{
	transaction_t *transaction = handle->h_transaction;
	journal_t *journal;
	int result;
	int wanted;

	if (is_handle_aborted(handle))
		return -EROFS;
	journal = transaction->t_journal;

	result = 1;

	read_lock(&journal->j_state_lock);

	/* Don't extend a locked-down transaction! */
	if (transaction->t_state != T_RUNNING) {
		jbd2_debug(3, "denied handle %p %d blocks: "
			  "transaction not running\n", handle, nblocks);
		goto error_out;
	}

	nblocks += DIV_ROUND_UP(
			handle->h_revoke_credits_requested + revoke_records,
			journal->j_revoke_records_per_block) -
		DIV_ROUND_UP(
			handle->h_revoke_credits_requested,
			journal->j_revoke_records_per_block);
	wanted = atomic_add_return(nblocks,
				   &transaction->t_outstanding_credits);

	if (wanted > journal->j_max_transaction_buffers) {
		jbd2_debug(3, "denied handle %p %d blocks: "
			  "transaction too large\n", handle, nblocks);
		atomic_sub(nblocks, &transaction->t_outstanding_credits);
		goto error_out;
	}

	trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
				 transaction->t_tid,
				 handle->h_type, handle->h_line_no,
				 handle->h_total_credits,
				 nblocks);

	handle->h_total_credits += nblocks;
	handle->h_requested_credits += nblocks;
	handle->h_revoke_credits += revoke_records;
	handle->h_revoke_credits_requested += revoke_records;
	result = 0;

	jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks);
error_out:
	read_unlock(&journal->j_state_lock);
	return result;
}

static void stop_this_handle(handle_t *handle)
{
	transaction_t *transaction = handle->h_transaction;
	journal_t *journal = transaction->t_journal;
	int revokes;

	J_ASSERT(journal_current_handle() == handle);
	J_ASSERT(atomic_read(&transaction->t_updates) > 0);
	current->journal_info = NULL;
	/*
	 * Subtract necessary revoke descriptor blocks from handle credits. We
	 * take care to account only for revoke descriptor blocks the
	 * transaction will really need, as large sequences of transactions
	 * with small numbers of revokes are relatively common.
	 */
	revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits;
	if (revokes) {
		int t_revokes, revoke_descriptors;
		int rr_per_blk = journal->j_revoke_records_per_block;

		WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk)
				> handle->h_total_credits);
		t_revokes = atomic_add_return(revokes,
				&transaction->t_outstanding_revokes);
		revoke_descriptors =
			DIV_ROUND_UP(t_revokes, rr_per_blk) -
			DIV_ROUND_UP(t_revokes - revokes, rr_per_blk);
		handle->h_total_credits -= revoke_descriptors;
	}
	atomic_sub(handle->h_total_credits,
		   &transaction->t_outstanding_credits);
	if (handle->h_rsv_handle)
		__jbd2_journal_unreserve_handle(handle->h_rsv_handle,
						transaction);
	if (atomic_dec_and_test(&transaction->t_updates))
		wake_up(&journal->j_wait_updates);

	rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
	/*
	 * Scope of the GFP_NOFS context is over here and so we can restore the
	 * original alloc context.
	 */
	memalloc_nofs_restore(handle->saved_alloc_context);
}
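
/*
 * Worked example for the revoke descriptor accounting above (all numbers
 * illustrative): with j_revoke_records_per_block == 32, a handle returning
 * revokes == 10 while t_outstanding_revokes grows from 30 to 40 yields
 * DIV_ROUND_UP(40, 32) - DIV_ROUND_UP(30, 32) = 2 - 1 = 1, so exactly one
 * credit is kept back from the handle for the extra descriptor block
 * instead of being returned to the transaction.
 */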

/**
 * jbd2__journal_restart() - restart a handle.
 * @handle:  handle to restart
 * @nblocks: nr credits requested
 * @revoke_records: number of revoke record credits requested
 * @gfp_mask: memory allocation flags (for start_this_handle)
 *
 * Restart a handle for a multi-transaction filesystem
 * operation.
 *
 * If the jbd2_journal_extend() call above fails to grant new buffer credits
 * to a running handle, a call to jbd2_journal_restart will commit the
 * handle's transaction so far and reattach the handle to a new
 * transaction capable of guaranteeing the requested number of
 * credits. We preserve the reserved handle if there's one attached to the
 * passed-in handle.
 */
int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
			  gfp_t gfp_mask)
{
	transaction_t *transaction = handle->h_transaction;
	journal_t *journal;
	tid_t		tid;
	int		need_to_start;
	int		ret;

	/* If we've had an abort of any type, don't even think about
	 * actually doing the restart! */
	if (is_handle_aborted(handle))
		return 0;
	journal = transaction->t_journal;
	tid = transaction->t_tid;

	/*
	 * First unlink the handle from its current transaction, and start the
	 * commit on that.
	 */
	jbd2_debug(2, "restarting handle %p\n", handle);
	stop_this_handle(handle);
	handle->h_transaction = NULL;

	/*
	 * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
	 * get rid of pointless j_state_lock traffic like this.
	 */
	read_lock(&journal->j_state_lock);
	need_to_start = !tid_geq(journal->j_commit_request, tid);
	read_unlock(&journal->j_state_lock);
	if (need_to_start)
		jbd2_log_start_commit(journal, tid);
	handle->h_total_credits = nblocks +
		DIV_ROUND_UP(revoke_records,
			     journal->j_revoke_records_per_block);
	handle->h_revoke_credits = revoke_records;
	ret = start_this_handle(journal, handle, gfp_mask);
	trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
				 ret ? 0 : handle->h_transaction->t_tid,
				 handle->h_type, handle->h_line_no,
				 handle->h_total_credits);
	return ret;
}
EXPORT_SYMBOL(jbd2__journal_restart);


int jbd2_journal_restart(handle_t *handle, int nblocks)
{
	return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);
}
EXPORT_SYMBOL(jbd2_journal_restart);
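
/*
 * Illustrative sketch (not part of this file): the common extend-or-restart
 * pattern built on the two calls above. A positive return from
 * jbd2_journal_extend() means the running transaction is full, so the
 * handle is moved to a fresh transaction; "handle" and "nblocks" are
 * hypothetical placeholders.
 *
 *	err = jbd2_journal_extend(handle, nblocks, 0);
 *	if (err > 0)
 *		err = jbd2_journal_restart(handle, nblocks);
 *	if (err)
 *		(bail out; note a restart commits the changes made so far)
 */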

/*
 * Waits for any outstanding t_updates to finish.
 * This is called with write j_state_lock held.
 */
void jbd2_journal_wait_updates(journal_t *journal)
{
	DEFINE_WAIT(wait);

	while (1) {
		/*
		 * Note that the running transaction can get freed under us if
		 * this transaction is getting committed in
		 * jbd2_journal_commit_transaction() ->
		 * jbd2_journal_free_transaction(). This can only happen when we
		 * release j_state_lock -> schedule() -> acquire j_state_lock.
		 * Hence we must re-read the j_running_transaction value every
		 * time (after each j_state_lock release/acquire cycle);
		 * otherwise we may end up using an already freed transaction.
		 */
		transaction_t *transaction = journal->j_running_transaction;

		if (!transaction)
			break;

		prepare_to_wait(&journal->j_wait_updates, &wait,
				TASK_UNINTERRUPTIBLE);
		if (!atomic_read(&transaction->t_updates)) {
			finish_wait(&journal->j_wait_updates, &wait);
			break;
		}
		write_unlock(&journal->j_state_lock);
		schedule();
		finish_wait(&journal->j_wait_updates, &wait);
		write_lock(&journal->j_state_lock);
	}
}

/**
 * jbd2_journal_lock_updates() - establish a transaction barrier.
 * @journal:  Journal to establish a barrier on.
 *
 * This locks out any further updates from being started, and blocks
 * until all existing updates have completed, returning only once the
 * journal is in a quiescent state with no updates running.
 *
 * The journal lock should not be held on entry.
 */
void jbd2_journal_lock_updates(journal_t *journal)
{
	jbd2_might_wait_for_commit(journal);

	write_lock(&journal->j_state_lock);
	++journal->j_barrier_count;

	/* Wait until there are no reserved handles */
	if (atomic_read(&journal->j_reserved_credits)) {
		write_unlock(&journal->j_state_lock);
		wait_event(journal->j_wait_reserved,
			   atomic_read(&journal->j_reserved_credits) == 0);
		write_lock(&journal->j_state_lock);
	}

	/* Wait until there are no running t_updates */
	jbd2_journal_wait_updates(journal);

	write_unlock(&journal->j_state_lock);

	/*
	 * We have now established a barrier against other normal updates, but
	 * we also need to barrier against other jbd2_journal_lock_updates() calls
	 * to make sure that we serialise special journal-locked operations
	 * too.
	 */
	mutex_lock(&journal->j_barrier);
}

/**
 * jbd2_journal_unlock_updates() - release barrier
 * @journal:  Journal to release the barrier on.
 *
 * Release a transaction barrier obtained with jbd2_journal_lock_updates().
 *
 * Should be called without the journal lock held.
 */
void jbd2_journal_unlock_updates(journal_t *journal)
{
	J_ASSERT(journal->j_barrier_count != 0);

	mutex_unlock(&journal->j_barrier);
	write_lock(&journal->j_state_lock);
	--journal->j_barrier_count;
	write_unlock(&journal->j_state_lock);
	wake_up_all(&journal->j_wait_transaction_locked);
}
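
/*
 * Illustrative sketch (not part of this file): the barrier pair in use
 * around an operation that needs a quiescent journal ("journal" is a
 * hypothetical placeholder):
 *
 *	jbd2_journal_lock_updates(journal);
 *	(no new handles can start and none are running here)
 *	jbd2_journal_unlock_updates(journal);
 */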

static void warn_dirty_buffer(struct buffer_head *bh)
{
	printk(KERN_WARNING
	       "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). "
	       "There's a risk of filesystem corruption in case of system "
	       "crash.\n",
	       bh->b_bdev, (unsigned long long)bh->b_blocknr);
}

/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
static void jbd2_freeze_jh_data(struct journal_head *jh)
{
	char *source;
	struct buffer_head *bh = jh2bh(jh);

	J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
	source = kmap_local_folio(bh->b_folio, bh_offset(bh));
	/* Fire data frozen trigger just before we copy the data */
	jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers);
	memcpy(jh->b_frozen_data, source, bh->b_size);
	kunmap_local(source);

	/*
	 * Now that the frozen data is saved off, we need to store any matching
	 * triggers.
	 */
	jh->b_frozen_triggers = jh->b_triggers;
}

/*
 * If the buffer is already part of the current transaction, then there
 * is nothing we need to do.  If it is already part of a prior
 * transaction which we are still committing to disk, then we need to
 * make sure that we do not overwrite the old copy: we do copy-out to
 * preserve the copy going to disk.  We also account the buffer against
 * the handle's metadata buffer credits (unless the buffer is already
 * part of the transaction, that is).
 *
 */
static int
do_get_write_access(handle_t *handle, struct journal_head *jh,
			int force_copy)
{
	struct buffer_head *bh;
	transaction_t *transaction = handle->h_transaction;
	journal_t *journal;
	int error;
	char *frozen_buffer = NULL;
	unsigned long start_lock, time_lock;

	journal = transaction->t_journal;

	jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);

	JBUFFER_TRACE(jh, "entry");
repeat:
	bh = jh2bh(jh);

	/* @@@ Need to check for errors here at some point. */

	start_lock = jiffies;
	lock_buffer(bh);
	spin_lock(&jh->b_state_lock);

	/* If it takes too long to lock the buffer, trace it */
	time_lock = jbd2_time_diff(start_lock, jiffies);
	if (time_lock > HZ/10)
		trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
			jiffies_to_msecs(time_lock));

	/* We now hold the buffer lock so it is safe to query the buffer
	 * state.  Is the buffer dirty?
	 *
	 * If so, there are two possibilities.  The buffer may be
	 * non-journaled, and undergoing a quite legitimate writeback.
	 * Otherwise, it is journaled, and we don't expect dirty buffers
	 * in that state (the buffers should be marked JBD_Dirty
	 * instead.)  So either the IO is being done under our own
	 * control and this is a bug, or it's a third party IO such as
	 * dump(8) (which may leave the buffer scheduled for read ---
	 * ie. locked but not dirty) or tune2fs (which may actually have
	 * the buffer dirtied, ugh.)  */

	if (buffer_dirty(bh) && jh->b_transaction) {
		warn_dirty_buffer(bh);
		/*
		 * We need to clean the dirty flag and we must do it under the
		 * buffer lock to be sure we don't race with running write-out.
		 */
		JBUFFER_TRACE(jh, "Journalling dirty buffer");
		clear_buffer_dirty(bh);
		/*
		 * The buffer is going to be added to BJ_Reserved list now and
		 * nothing guarantees jbd2_journal_dirty_metadata() will be
		 * ever called for it. So we need to set jbddirty bit here to
		 * make sure the buffer is dirtied and written out when the
		 * journaling machinery is done with it.
		 */
		set_buffer_jbddirty(bh);
	}

	error = -EROFS;
	if (is_handle_aborted(handle)) {
		spin_unlock(&jh->b_state_lock);
		unlock_buffer(bh);
		goto out;
	}
	error = 0;

	/*
	 * The buffer is already part of this transaction if b_transaction or
	 * b_next_transaction points to it
	 */
	if (jh->b_transaction == transaction ||
	    jh->b_next_transaction == transaction) {
		unlock_buffer(bh);
		goto done;
	}

	/*
	 * this is the first time this transaction is touching this buffer,
	 * reset the modified flag
	 */
	jh->b_modified = 0;

	/*
	 * If the buffer is not journaled right now, we need to make sure it
	 * doesn't get written to disk before the caller actually commits the
	 * new data
	 */
	if (!jh->b_transaction) {
		JBUFFER_TRACE(jh, "no transaction");
		J_ASSERT_JH(jh, !jh->b_next_transaction);
		JBUFFER_TRACE(jh, "file as BJ_Reserved");
		/*
		 * Make sure all stores to jh (b_modified, b_frozen_data) are
		 * visible before attaching it to the running transaction.
		 * Paired with barrier in jbd2_write_access_granted()
		 */
		smp_wmb();
		spin_lock(&journal->j_list_lock);
		if (test_clear_buffer_dirty(bh)) {
			/*
			 * Execute buffer dirty clearing and jh->b_transaction
			 * assignment under journal->j_list_lock locked to
			 * prevent bh being removed from checkpoint list if
			 * the buffer is in an intermediate state (not dirty
			 * and jh->b_transaction is NULL).
			 */
			JBUFFER_TRACE(jh, "Journalling dirty buffer");
			set_buffer_jbddirty(bh);
		}
		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
		spin_unlock(&journal->j_list_lock);
		unlock_buffer(bh);
		goto done;
	}
	unlock_buffer(bh);

	/*
	 * If there is already a copy-out version of this buffer, then we don't
	 * need to make another one
	 */
	if (jh->b_frozen_data) {
		JBUFFER_TRACE(jh, "has frozen data");
		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
		goto attach_next;
	}

	JBUFFER_TRACE(jh, "owned by older transaction");
	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
	J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);

	/*
	 * There is one case we have to be very careful about.  If the
	 * committing transaction is currently writing this buffer out to disk
	 * and has NOT made a copy-out, then we cannot modify the buffer
	 * contents at all right now.  The essence of copy-out is that it is
	 * the extra copy, not the primary copy, which gets journaled.  If the
	 * primary copy is already going to disk then we cannot do copy-out
	 * here.
	 */
	if (buffer_shadow(bh)) {
		JBUFFER_TRACE(jh, "on shadow: sleep");
		spin_unlock(&jh->b_state_lock);
		wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
		goto repeat;
	}

	/*
	 * Only do the copy if the currently-owning transaction still needs it.
	 * If buffer isn't on BJ_Metadata list, the committing transaction is
	 * past that stage (here we use the fact that BH_Shadow is set under
	 * bh_state lock together with refiling to BJ_Shadow list and at this
	 * point we know the buffer doesn't have BH_Shadow set).
	 *
	 * Subtle point, though: if this is a get_undo_access, then we will be
	 * relying on the frozen_data to contain the new value of the
	 * committed_data record after the transaction, so we HAVE to force the
	 * frozen_data copy in that case.
	 */
	if (jh->b_jlist == BJ_Metadata || force_copy) {
		JBUFFER_TRACE(jh, "generate frozen data");
		if (!frozen_buffer) {
			JBUFFER_TRACE(jh, "allocate memory for buffer");
			spin_unlock(&jh->b_state_lock);
			frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
						   GFP_NOFS | __GFP_NOFAIL);
			goto repeat;
		}
		jh->b_frozen_data = frozen_buffer;
		frozen_buffer = NULL;
		jbd2_freeze_jh_data(jh);
	}
attach_next:
	/*
	 * Make sure all stores to jh (b_modified, b_frozen_data) are visible
	 * before attaching it to the running transaction. Paired with barrier
	 * in jbd2_write_access_granted()
	 */
	smp_wmb();
	jh->b_next_transaction = transaction;

done:
	spin_unlock(&jh->b_state_lock);

	/*
	 * If we are about to journal a buffer, then any revoke pending on it is
	 * no longer valid
	 */
	jbd2_journal_cancel_revoke(handle, jh);

out:
	if (unlikely(frozen_buffer))	/* It's usually NULL */
		jbd2_free(frozen_buffer, bh->b_size);

	JBUFFER_TRACE(jh, "exit");
	return error;
}

/* Fast check whether buffer is already attached to the required transaction */
static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
							bool undo)
{
	struct journal_head *jh;
	bool ret = false;

	/* Dirty buffers require special handling... */
	if (buffer_dirty(bh))
		return false;

	/*
	 * RCU protects us from dereferencing freed pages. So the checks we do
	 * are guaranteed not to oops. However the jh slab object can get freed
	 * & reallocated while we work with it. So we have to be careful. When
	 * we see jh attached to the running transaction, we know it must stay
	 * so until the transaction is committed. Thus jh won't be freed and
	 * will be attached to the same bh while we run.  However, it can
	 * happen that jh gets freed, reallocated, and attached to the
	 * transaction just after we get the pointer to it from bh. So we have
	 * to be careful and recheck that jh still belongs to our bh before we
	 * return success.
	 */
	rcu_read_lock();
	if (!buffer_jbd(bh))
		goto out;
	/* This should be bh2jh() but that doesn't work with inline functions */
	jh = READ_ONCE(bh->b_private);
	if (!jh)
		goto out;
	/* For undo access buffer must have data copied */
	if (undo && !jh->b_committed_data)
		goto out;
	if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
	    READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
		goto out;
	/*
	 * There are two reasons for the barrier here:
	 * 1) Make sure to fetch b_bh after we did previous checks so that we
	 * detect when jh went through free, realloc, attach to transaction
	 * while we were checking. Paired with implicit barrier in that path.
	 * 2) So that access to bh done after jbd2_write_access_granted()
	 * doesn't get reordered and see inconsistent state of concurrent
	 * do_get_write_access().
	 */
	smp_mb();
	if (unlikely(jh->b_bh != bh))
		goto out;
	ret = true;
out:
	rcu_read_unlock();
	return ret;
}

/**
 * jbd2_journal_get_write_access() - notify intent to modify a buffer
 *				     for metadata (not data) update.
 * @handle: transaction to add buffer modifications to
 * @bh:     bh to be used for metadata writes
 *
 * Returns: error code or 0 on success.
 *
 * In full data journalling mode the buffer may be of type BJ_AsyncData,
 * because we're ``write()ing`` a buffer which is also part of a shared mapping.
 */

int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
{
	struct journal_head *jh;
	journal_t *journal;
	int rc;

	if (is_handle_aborted(handle))
		return -EROFS;

	journal = handle->h_transaction->t_journal;
	if (jbd2_check_fs_dev_write_error(journal)) {
		/*
		 * If the fs dev has writeback errors, it may have failed
		 * to async write out metadata buffers in the background.
		 * In this case, we could read old data from disk and write
		 * it out again, which may lead to on-disk filesystem
		 * inconsistency. Aborting the journal can avoid that.
		 */
		jbd2_journal_abort(journal, -EIO);
		return -EIO;
	}

	if (jbd2_write_access_granted(handle, bh, false))
		return 0;

	jh = jbd2_journal_add_journal_head(bh);
	/* We do not want to get caught playing with fields which the
	 * log thread also manipulates.  Make sure that the buffer
	 * completes any outstanding IO before proceeding. */
	rc = do_get_write_access(handle, jh, 0);
	jbd2_journal_put_journal_head(jh);
	return rc;
}
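
/*
 * Illustrative sketch (not part of this file): declaring intent before
 * touching an existing metadata buffer. The buffer must not be modified
 * until jbd2_journal_get_write_access() has returned 0; identifiers other
 * than the jbd2 calls are hypothetical placeholders.
 *
 *	bh = sb_bread(sb, blocknr);
 *	err = jbd2_journal_get_write_access(handle, bh);
 *	if (err)
 *		goto out;
 *	(now it is safe to modify bh->b_data)
 *	err = jbd2_journal_dirty_metadata(handle, bh);
 */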


/*
 * When the user wants to journal a newly created buffer_head
 * (ie. getblk() returned a new buffer and we are going to populate it
 * manually rather than reading off disk), then we need to keep the
 * buffer_head locked until it has been completely filled with new
 * data.  In this case, we should be able to make the assertion that
 * the bh is not already part of an existing transaction.
 *
 * The buffer should already be locked by the caller by this point.
 * There is no lock ranking violation: it was a newly created,
 * unlocked buffer beforehand. */

/**
 * jbd2_journal_get_create_access() - notify intent to use newly created bh
 * @handle: transaction to add the new buffer to
 * @bh: new buffer.
 *
 * Call this if you create a new bh.
 */
int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
{
	transaction_t *transaction = handle->h_transaction;
	journal_t *journal;
	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
	int err;

	jbd2_debug(5, "journal_head %p\n", jh);
	err = -EROFS;
	if (is_handle_aborted(handle))
		goto out;
	journal = transaction->t_journal;
	err = 0;

	JBUFFER_TRACE(jh, "entry");
	/*
	 * The buffer may already belong to this transaction due to pre-zeroing
	 * in the filesystem's new_block code.  It may also be on the previous,
	 * committing transaction's lists, but it HAS to be in Forget state in
	 * that case: the transaction must have deleted the buffer for it to be
	 * reused here.
	 */
	spin_lock(&jh->b_state_lock);
	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
		jh->b_transaction == NULL ||
		(jh->b_transaction == journal->j_committing_transaction &&
			  jh->b_jlist == BJ_Forget)));

	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));

	if (jh->b_transaction == NULL) {
		/*
		 * Previous jbd2_journal_forget() could have left the buffer
		 * with jbddirty bit set because it was being committed. When
		 * the commit finished, we've filed the buffer for
		 * checkpointing and marked it dirty. Now we are reallocating
		 * the buffer so the transaction freeing it must have
		 * committed and so it's safe to clear the dirty bit.
		 */
		clear_buffer_dirty(jh2bh(jh));
		/* first access by this transaction */
		jh->b_modified = 0;

		JBUFFER_TRACE(jh, "file as BJ_Reserved");
		spin_lock(&journal->j_list_lock);
		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
		spin_unlock(&journal->j_list_lock);
	} else if (jh->b_transaction == journal->j_committing_transaction) {
		/* first access by this transaction */
		jh->b_modified = 0;

		JBUFFER_TRACE(jh, "set next transaction");
		spin_lock(&journal->j_list_lock);
		jh->b_next_transaction = transaction;
		spin_unlock(&journal->j_list_lock);
	}
	spin_unlock(&jh->b_state_lock);

	/*
	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
	 * blocks which contain freed but then revoked metadata.  We need
	 * to cancel the revoke in case we end up freeing it yet again
	 * and then reallocating it as data - this would cause a second
	 * revoke, which hits an assertion error.
	 */
	JBUFFER_TRACE(jh, "cancelling revoke");
	jbd2_journal_cancel_revoke(handle, jh);
out:
	jbd2_journal_put_journal_head(jh);
	return err;
}
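
/*
 * Illustrative sketch (not part of this file): journaling a freshly
 * allocated block. Per the comment above, the buffer stays locked while
 * it is being filled; identifiers other than the jbd2 calls are
 * hypothetical placeholders.
 *
 *	bh = sb_getblk(sb, blocknr);
 *	lock_buffer(bh);
 *	err = jbd2_journal_get_create_access(handle, bh);
 *	if (!err) {
 *		memset(bh->b_data, 0, bh->b_size);
 *		set_buffer_uptodate(bh);
 *	}
 *	unlock_buffer(bh);
 *	if (!err)
 *		err = jbd2_journal_dirty_metadata(handle, bh);
 */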

/**
 * jbd2_journal_get_undo_access() -  Notify intent to modify metadata with
 *     non-rewindable consequences
 * @handle: transaction
 * @bh: buffer to undo
 *
 * Sometimes there is a need to distinguish between metadata which has
 * been committed to disk and that which has not.  The ext3fs code uses
 * this for freeing and allocating space, we have to make sure that we
 * do not reuse freed space until the deallocation has been committed,
 * since if we overwrote that space we would make the delete
 * un-rewindable in case of a crash.
 *
 * To deal with that, jbd2_journal_get_undo_access requests write access to a
 * buffer for parts of non-rewindable operations such as delete
 * operations on the bitmaps.  The journaling code must keep a copy of
 * the buffer's contents prior to the undo_access call until such time
 * as we know that the buffer has definitely been committed to disk.
 *
 * We never need to know which transaction the committed data is part
 * of, buffers touched here are guaranteed to be dirtied later and so
 * will be committed to a new transaction in due course, at which point
 * we can discard the old committed data pointer.
 *
 * Returns error number or 0 on success.
 */
int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
{
	int err;
	struct journal_head *jh;
	char *committed_data = NULL;

	if (is_handle_aborted(handle))
		return -EROFS;

	if (jbd2_write_access_granted(handle, bh, true))
		return 0;

	jh = jbd2_journal_add_journal_head(bh);
	JBUFFER_TRACE(jh, "entry");

	/*
	 * Do this first --- it can drop the journal lock, so we want to
	 * make sure that obtaining the committed_data is done
	 * atomically wrt. completion of any outstanding commits.
	 */
	err = do_get_write_access(handle, jh, 1);
	if (err)
		goto out;

repeat:
	if (!jh->b_committed_data)
		committed_data = jbd2_alloc(jh2bh(jh)->b_size,
					    GFP_NOFS|__GFP_NOFAIL);

	spin_lock(&jh->b_state_lock);
	if (!jh->b_committed_data) {
		/* Copy out the current buffer contents into the
		 * preserved, committed copy. */
		JBUFFER_TRACE(jh, "generate b_committed data");
		if (!committed_data) {
			spin_unlock(&jh->b_state_lock);
			goto repeat;
		}

		jh->b_committed_data = committed_data;
		committed_data = NULL;
		memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
	}
	spin_unlock(&jh->b_state_lock);
out:
	jbd2_journal_put_journal_head(jh);
	if (unlikely(committed_data))
		jbd2_free(committed_data, bh->b_size);
	return err;
}
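
/*
 * Illustrative sketch (not part of this file): the bitmap-style use
 * described above. Undo access is requested before clearing bits so the
 * committed copy of the bitmap is preserved until the deallocation
 * commits; identifiers other than the jbd2 calls are hypothetical
 * placeholders.
 *
 *	err = jbd2_journal_get_undo_access(handle, bitmap_bh);
 *	if (err)
 *		goto out;
 *	clear_bit(bit, (unsigned long *)bitmap_bh->b_data);
 *	err = jbd2_journal_dirty_metadata(handle, bitmap_bh);
 */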

/**
 * jbd2_journal_set_triggers() - Add triggers for commit writeout
 * @bh: buffer to trigger on
 * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
 *
 * Set any triggers on this journal_head.  This is always safe, because
 * triggers for a committing buffer will be saved off, and triggers for
 * a running transaction will match the buffer in that transaction.
 *
 * Call with NULL to clear the triggers.
 */
void jbd2_journal_set_triggers(struct buffer_head *bh,
			       struct jbd2_buffer_trigger_type *type)
{
	struct journal_head *jh = jbd2_journal_grab_journal_head(bh);

	if (WARN_ON_ONCE(!jh))
		return;
	jh->b_triggers = type;
	jbd2_journal_put_journal_head(jh);
}
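
/*
 * Illustrative sketch (not part of this file): a trigger that recomputes a
 * checksum in the frozen copy just before it is written to the journal,
 * in the style of filesystems that use metadata checksumming. All
 * identifiers except the jbd2 types and calls are hypothetical.
 *
 *	static void my_frozen(struct jbd2_buffer_trigger_type *triggers,
 *			      struct buffer_head *bh, void *mapped_data,
 *			      size_t size)
 *	{
 *		(recompute the checksum over mapped_data here)
 *	}
 *
 *	static struct jbd2_buffer_trigger_type my_triggers = {
 *		.t_frozen = my_frozen,
 *	};
 *
 *	jbd2_journal_set_triggers(bh, &my_triggers);
 */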

void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
				struct jbd2_buffer_trigger_type *triggers)
{
	struct buffer_head *bh = jh2bh(jh);

	if (!triggers || !triggers->t_frozen)
		return;

	triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
}

void jbd2_buffer_abort_trigger(struct journal_head *jh,
			       struct jbd2_buffer_trigger_type *triggers)
{
	if (!triggers || !triggers->t_abort)
		return;

	triggers->t_abort(triggers, jh2bh(jh));
}
1461  
1462  /**
1463   * jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
1464   * @handle: transaction to add buffer to.
1465   * @bh: buffer to mark
1466   *
1467   * mark dirty metadata which needs to be journaled as part of the current
1468   * transaction.
1469   *
1470   * The buffer must have previously had jbd2_journal_get_write_access()
1471   * called so that it has a valid journal_head attached to the buffer
1472   * head.
1473   *
1474   * The buffer is placed on the transaction's metadata list and is marked
1475   * as belonging to the transaction.
1476   *
1477   * Returns error number or 0 on success.
1478   *
1479   * Special care needs to be taken if the buffer already belongs to the
1480   * current committing transaction (in which case we should have frozen
1481   * data present for that commit).  In that case, we don't relink the
1482   * buffer: that only gets done when the old transaction finally
1483   * completes its commit.
1484   */
1485  int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1486  {
1487  	transaction_t *transaction = handle->h_transaction;
1488  	journal_t *journal;
1489  	struct journal_head *jh;
1490  	int ret = 0;
1491  
1492  	if (!buffer_jbd(bh))
1493  		return -EUCLEAN;
1494  
1495  	/*
1496  	 * We don't grab jh reference here since the buffer must be part
1497  	 * of the running transaction.
1498  	 */
1499  	jh = bh2jh(bh);
1500  	jbd2_debug(5, "journal_head %p\n", jh);
1501  	JBUFFER_TRACE(jh, "entry");
1502  
1503  	/*
1504  	 * This and the following assertions are unreliable since we may see jh
1505  	 * in an inconsistent state unless we grab the bh_state lock. But this
1506  	 * is crucial to catch bugs so let's do a reliable check until the
1507  	 * lockless handling is fully proven.
1508  	 */
1509  	if (data_race(jh->b_transaction != transaction &&
1510  	    jh->b_next_transaction != transaction)) {
1511  		spin_lock(&jh->b_state_lock);
1512  		J_ASSERT_JH(jh, jh->b_transaction == transaction ||
1513  				jh->b_next_transaction == transaction);
1514  		spin_unlock(&jh->b_state_lock);
1515  	}
1516  	if (jh->b_modified == 1) {
1517  		/* If it's in our transaction it must be on the BJ_Metadata list. */
1518  		if (data_race(jh->b_transaction == transaction &&
1519  		    jh->b_jlist != BJ_Metadata)) {
1520  			spin_lock(&jh->b_state_lock);
1521  			if (jh->b_transaction == transaction &&
1522  			    jh->b_jlist != BJ_Metadata)
1523  				pr_err("JBD2: assertion failure: h_type=%u "
1524  				       "h_line_no=%u block_no=%llu jlist=%u\n",
1525  				       handle->h_type, handle->h_line_no,
1526  				       (unsigned long long) bh->b_blocknr,
1527  				       jh->b_jlist);
1528  			J_ASSERT_JH(jh, jh->b_transaction != transaction ||
1529  					jh->b_jlist == BJ_Metadata);
1530  			spin_unlock(&jh->b_state_lock);
1531  		}
1532  		goto out;
1533  	}
1534  
1535  	journal = transaction->t_journal;
1536  	spin_lock(&jh->b_state_lock);
1537  
1538  	if (is_handle_aborted(handle)) {
1539  		/*
1540  		 * Check journal aborting with @jh->b_state_lock locked,
1541  		 * since 'jh->b_transaction' could be replaced with
1542  		 * 'jh->b_next_transaction' during old transaction
1543  		 * committing if journal aborted, which may fail
1544  		 * assertion on 'jh->b_frozen_data == NULL'.
1545  		 */
1546  		ret = -EROFS;
1547  		goto out_unlock_bh;
1548  	}
1549  
1550  	if (jh->b_modified == 0) {
1551  		/*
1552  		 * This buffer was modified and is becoming part
1553  		 * of the transaction. This accounting needs to
1554  		 * be done only once per transaction. -bzzz
1555  		 */
1556  		if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
1557  			ret = -ENOSPC;
1558  			goto out_unlock_bh;
1559  		}
1560  		jh->b_modified = 1;
1561  		handle->h_total_credits--;
1562  	}
1563  
1564  	/*
1565  	 * fastpath, to avoid expensive locking.  If this buffer is already
1566  	 * on the running transaction's metadata list there is nothing to do.
1567  	 * Nobody can take it off again because there is a handle open.
1568  	 * I _think_ we're OK here with SMP barriers - a mistaken decision will
1569  	 * result in this test being false, so we go in and take the locks.
1570  	 */
1571  	if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1572  		JBUFFER_TRACE(jh, "fastpath");
1573  		if (unlikely(jh->b_transaction !=
1574  			     journal->j_running_transaction)) {
1575  			printk(KERN_ERR "JBD2: %s: "
1576  			       "jh->b_transaction (%llu, %p, %u) != "
1577  			       "journal->j_running_transaction (%p, %u)\n",
1578  			       journal->j_devname,
1579  			       (unsigned long long) bh->b_blocknr,
1580  			       jh->b_transaction,
1581  			       jh->b_transaction ? jh->b_transaction->t_tid : 0,
1582  			       journal->j_running_transaction,
1583  			       journal->j_running_transaction ?
1584  			       journal->j_running_transaction->t_tid : 0);
1585  			ret = -EINVAL;
1586  		}
1587  		goto out_unlock_bh;
1588  	}
1589  
1590  	set_buffer_jbddirty(bh);
1591  
1592  	/*
1593  	 * Metadata already on the current transaction list doesn't
1594  	 * need to be filed.  Metadata on another transaction's list must
1595  	 * be committing, and will be refiled once the commit completes:
1596  	 * leave it alone for now.
1597  	 */
1598  	if (jh->b_transaction != transaction) {
1599  		JBUFFER_TRACE(jh, "already on other transaction");
1600  		if (unlikely(((jh->b_transaction !=
1601  			       journal->j_committing_transaction)) ||
1602  			     (jh->b_next_transaction != transaction))) {
1603  			printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
1604  			       "bad jh for block %llu: "
1605  			       "transaction (%p, %u), "
1606  			       "jh->b_transaction (%p, %u), "
1607  			       "jh->b_next_transaction (%p, %u), jlist %u\n",
1608  			       journal->j_devname,
1609  			       (unsigned long long) bh->b_blocknr,
1610  			       transaction, transaction->t_tid,
1611  			       jh->b_transaction,
1612  			       jh->b_transaction ?
1613  			       jh->b_transaction->t_tid : 0,
1614  			       jh->b_next_transaction,
1615  			       jh->b_next_transaction ?
1616  			       jh->b_next_transaction->t_tid : 0,
1617  			       jh->b_jlist);
1618  			WARN_ON(1);
1619  			ret = -EINVAL;
1620  		}
1621  		/* And this case is illegal: we can't reuse another
1622  		 * transaction's data buffer, ever. */
1623  		goto out_unlock_bh;
1624  	}
1625  
1626  	/* That test should have eliminated the following case: */
1627  	J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
1628  
1629  	JBUFFER_TRACE(jh, "file as BJ_Metadata");
1630  	spin_lock(&journal->j_list_lock);
1631  	__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
1632  	spin_unlock(&journal->j_list_lock);
1633  out_unlock_bh:
1634  	spin_unlock(&jh->b_state_lock);
1635  out:
1636  	JBUFFER_TRACE(jh, "exit");
1637  	return ret;
1638  }
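
/*
 * Example: an illustrative sketch of the canonical handle life cycle around
 * jbd2_journal_dirty_metadata(). The function name and the one-credit
 * estimate are hypothetical; the jbd2 entry points are real.
 */
static int __maybe_unused demo_modify_one_block(journal_t *journal,
						struct buffer_head *bh)
{
	handle_t *handle;
	int err, err2;

	handle = jbd2_journal_start(journal, 1);	/* reserve one credit */
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	err = jbd2_journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data under the handle ... */
		err = jbd2_journal_dirty_metadata(handle, bh);
	}

	err2 = jbd2_journal_stop(handle);
	return err ? err : err2;
}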
1639  
1640  /**
1641   * jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1642   * @handle: transaction handle
1643   * @bh:     bh to 'forget'
1644   *
1645   * We can only do the bforget if there are no commits pending against the
1646   * buffer.  If the buffer is dirty in the current running transaction we
1647   * can safely unlink it.
1648   *
1649   * bh may not be a journalled buffer at all - it may be a non-JBD
1650   * buffer which came off the hashtable.  Check for this.
1651   *
1652   * Decrements bh->b_count by one.
1653   *
1654   * Allow this call even if the handle has aborted --- it may be part of
1655   * the caller's cleanup after an abort.
1656   */
1657  int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
1658  {
1659  	transaction_t *transaction = handle->h_transaction;
1660  	journal_t *journal;
1661  	struct journal_head *jh;
1662  	int drop_reserve = 0;
1663  	int err = 0;
1664  	int was_modified = 0;
1665  
1666  	if (is_handle_aborted(handle))
1667  		return -EROFS;
1668  	journal = transaction->t_journal;
1669  
1670  	BUFFER_TRACE(bh, "entry");
1671  
1672  	jh = jbd2_journal_grab_journal_head(bh);
1673  	if (!jh) {
1674  		__bforget(bh);
1675  		return 0;
1676  	}
1677  
1678  	spin_lock(&jh->b_state_lock);
1679  
1680  	/* Critical error: attempting to delete a bitmap buffer, maybe?
1681  	 * Don't do any jbd operations, and return an error. */
1682  	if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1683  			 "inconsistent data on disk")) {
1684  		err = -EIO;
1685  		goto drop;
1686  	}
1687  
1688  	/* keep track of whether or not this transaction modified us */
1689  	was_modified = jh->b_modified;
1690  
1691  	/*
1692  	 * The buffer's going from the transaction, we must drop
1693  	 * all references -bzzz
1694  	 */
1695  	jh->b_modified = 0;
1696  
1697  	if (jh->b_transaction == transaction) {
1698  		J_ASSERT_JH(jh, !jh->b_frozen_data);
1699  
1700  		/* If we are forgetting a buffer which is already part
1701  		 * of this transaction, then we can just drop it from
1702  		 * the transaction immediately. */
1703  		clear_buffer_dirty(bh);
1704  		clear_buffer_jbddirty(bh);
1705  
1706  		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1707  
1708  		/*
1709  		 * we only want to drop a reference if this transaction
1710  		 * modified the buffer
1711  		 */
1712  		if (was_modified)
1713  			drop_reserve = 1;
1714  
1715  		/*
1716  		 * We are no longer going to journal this buffer.
1717  		 * However, the commit of this transaction is still
1718  		 * important to the buffer: the delete that we are now
1719  		 * processing might obsolete an old log entry, so by
1720  		 * committing, we can satisfy the buffer's checkpoint.
1721  		 *
1722  		 * So, if we have a checkpoint on the buffer, we should
1723  		 * now refile the buffer on our BJ_Forget list so that
1724  		 * we know to remove the checkpoint after we commit.
1725  		 */
1726  
1727  		spin_lock(&journal->j_list_lock);
1728  		if (jh->b_cp_transaction) {
1729  			__jbd2_journal_temp_unlink_buffer(jh);
1730  			__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1731  		} else {
1732  			__jbd2_journal_unfile_buffer(jh);
1733  			jbd2_journal_put_journal_head(jh);
1734  		}
1735  		spin_unlock(&journal->j_list_lock);
1736  	} else if (jh->b_transaction) {
1737  		J_ASSERT_JH(jh, (jh->b_transaction ==
1738  				 journal->j_committing_transaction));
1739  		/* However, if the buffer is still owned by a prior
1740  		 * (committing) transaction, we can't drop it yet... */
1741  		JBUFFER_TRACE(jh, "belongs to older transaction");
1742  		/* ... but we CAN drop it from the new transaction by
1743  		 * marking the buffer as freed and setting b_next_transaction
1744  		 * to the new transaction, so that not only does the commit
1745  		 * code know it should clear dirty bits when it is done with
1746  		 * the buffer, but the buffer can also be checkpointed only
1747  		 * after the new transaction commits. */
1748  
1749  		set_buffer_freed(bh);
1750  
1751  		if (!jh->b_next_transaction) {
1752  			spin_lock(&journal->j_list_lock);
1753  			jh->b_next_transaction = transaction;
1754  			spin_unlock(&journal->j_list_lock);
1755  		} else {
1756  			J_ASSERT(jh->b_next_transaction == transaction);
1757  
1758  			/*
1759  			 * only drop a reference if this transaction modified
1760  			 * the buffer
1761  			 */
1762  			if (was_modified)
1763  				drop_reserve = 1;
1764  		}
1765  	} else {
1766  		/*
1767  		 * Finally, if the buffer does not belong to any
1768  		 * transaction, we can just drop it now if it has no
1769  		 * checkpoint.
1770  		 */
1771  		spin_lock(&journal->j_list_lock);
1772  		if (!jh->b_cp_transaction) {
1773  			JBUFFER_TRACE(jh, "belongs to no transaction");
1774  			spin_unlock(&journal->j_list_lock);
1775  			goto drop;
1776  		}
1777  
1778  		/*
1779  		 * Otherwise, if the buffer has been written to disk,
1780  		 * it is safe to remove the checkpoint and drop it.
1781  		 */
1782  		if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
1783  			spin_unlock(&journal->j_list_lock);
1784  			goto drop;
1785  		}
1786  
1787  		/*
1788  		 * The buffer is still not written to disk, we should
1789  		 * attach this buffer to current transaction so that the
1790  		 * buffer can be checkpointed only after the current
1791  		 * transaction commits.
1792  		 */
1793  		clear_buffer_dirty(bh);
1794  		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1795  		spin_unlock(&journal->j_list_lock);
1796  	}
1797  drop:
1798  	__brelse(bh);
1799  	spin_unlock(&jh->b_state_lock);
1800  	jbd2_journal_put_journal_head(jh);
1801  	if (drop_reserve) {
1802  		/* no need to reserve log space for this block -bzzz */
1803  		handle->h_total_credits++;
1804  	}
1805  	return err;
1806  }
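
/*
 * Example: an illustrative sketch of forgetting a metadata buffer whose
 * block is being freed, so a stale image is never written back. The helper
 * name is hypothetical and the surrounding allocator logic is elided.
 */
static int __maybe_unused demo_free_metadata_block(handle_t *handle,
						   struct buffer_head *bh)
{
	/* Consumes one b_count reference, like bforget() would. */
	int err = jbd2_journal_forget(handle, bh);

	/* The caller would then release the block in its allocator. */
	return err;
}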
1807  
1808  /**
1809   * jbd2_journal_stop() - complete a transaction
1810   * @handle: transaction to complete.
1811   *
1812   * All done for a particular handle.
1813   *
1814   * There is not much action needed here.  We just return any remaining
1815   * buffer credits to the transaction and remove the handle.  The only
1816   * complication is that we need to start a commit operation if the
1817   * filesystem is marked for synchronous update.
1818   *
1819   * jbd2_journal_stop itself will not usually return an error, but it may
1820   * do so in unusual circumstances.  In particular, expect it to
1821   * return -EIO if a jbd2_journal_abort has been executed since the
1822   * transaction began.
1823   */
1824  int jbd2_journal_stop(handle_t *handle)
1825  {
1826  	transaction_t *transaction = handle->h_transaction;
1827  	journal_t *journal;
1828  	int err = 0, wait_for_commit = 0;
1829  	tid_t tid;
1830  	pid_t pid;
1831  
1832  	if (--handle->h_ref > 0) {
1833  		jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1834  						 handle->h_ref);
1835  		if (is_handle_aborted(handle))
1836  			return -EIO;
1837  		return 0;
1838  	}
1839  	if (!transaction) {
1840  		/*
1841  		 * Handle is already detached from the transaction so there is
1842  		 * nothing to do other than free the handle.
1843  		 */
1844  		memalloc_nofs_restore(handle->saved_alloc_context);
1845  		goto free_and_exit;
1846  	}
1847  	journal = transaction->t_journal;
1848  	tid = transaction->t_tid;
1849  
1850  	if (is_handle_aborted(handle))
1851  		err = -EIO;
1852  
1853  	jbd2_debug(4, "Handle %p going down\n", handle);
1854  	trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
1855  				tid, handle->h_type, handle->h_line_no,
1856  				jiffies - handle->h_start_jiffies,
1857  				handle->h_sync, handle->h_requested_credits,
1858  				(handle->h_requested_credits -
1859  				 handle->h_total_credits));
1860  
1861  	/*
1862  	 * Implement synchronous transaction batching.  If the handle
1863  	 * was synchronous, don't force a commit immediately.  Let's
1864  	 * yield and let another thread piggyback onto this
1865  	 * transaction.  Keep doing that while new threads continue to
1866  	 * arrive.  It doesn't cost much - we're about to run a commit
1867  	 * and sleep on IO anyway.  Speeds up many-threaded, many-dir
1868  	 * operations by 30x or more...
1869  	 *
1870  	 * We try and optimize the sleep time against what the
1871  	 * underlying disk can do, instead of having a static sleep
1872  	 * time.  This is useful for the case where our storage is so
1873  	 * fast that it is more optimal to go ahead and force a flush
1874  	 * and wait for the transaction to be committed than it is to
1875  	 * wait for an arbitrary amount of time for new writers to
1876  	 * join the transaction.  We achieve this by measuring how
1877  	 * long it takes to commit a transaction, and compare it with
1878  	 * how long this transaction has been running, and if run time
1879  	 * < commit time then we sleep for the delta and commit.  This
1880  	 * greatly helps super fast disks that would see slowdowns as
1881  	 * more threads started doing fsyncs.
1882  	 *
1883  	 * But don't do this if this process was the most recent one
1884  	 * to perform a synchronous write.  We do this to detect the
1885  	 * case where a single process is doing a stream of sync
1886  	 * writes.  No point in waiting for joiners in that case.
1887  	 *
1888  	 * Setting max_batch_time to 0 disables this completely.
1889  	 */
1890  	pid = current->pid;
1891  	if (handle->h_sync && journal->j_last_sync_writer != pid &&
1892  	    journal->j_max_batch_time) {
1893  		u64 commit_time, trans_time;
1894  
1895  		journal->j_last_sync_writer = pid;
1896  
1897  		read_lock(&journal->j_state_lock);
1898  		commit_time = journal->j_average_commit_time;
1899  		read_unlock(&journal->j_state_lock);
1900  
1901  		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1902  						   transaction->t_start_time));
1903  
1904  		commit_time = max_t(u64, commit_time,
1905  				    1000*journal->j_min_batch_time);
1906  		commit_time = min_t(u64, commit_time,
1907  				    1000*journal->j_max_batch_time);
1908  
1909  		if (trans_time < commit_time) {
1910  			ktime_t expires = ktime_add_ns(ktime_get(),
1911  						       commit_time);
1912  			set_current_state(TASK_UNINTERRUPTIBLE);
1913  			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1914  		}
1915  	}
1916  
1917  	if (handle->h_sync)
1918  		transaction->t_synchronous_commit = 1;
1919  
1920  	/*
1921  	 * If the handle is marked SYNC, we need to set another commit
1922  	 * going!  We also want to force a commit if the transaction is too
1923  	 * old now.
1924  	 */
1925  	if (handle->h_sync ||
1926  	    time_after_eq(jiffies, transaction->t_expires)) {
1927  		/* Do this even for aborted journals: an abort still
1928  		 * completes the commit thread, it just doesn't write
1929  		 * anything to disk. */
1930  
1931  		jbd2_debug(2, "transaction too old, requesting commit for "
1932  					"handle %p\n", handle);
1933  		/* This is non-blocking */
1934  		jbd2_log_start_commit(journal, tid);
1935  
1936  		/*
1937  		 * Special case: JBD2_SYNC synchronous updates require us
1938  		 * to wait for the commit to complete.
1939  		 */
1940  		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1941  			wait_for_commit = 1;
1942  	}
1943  
1944  	/*
1945  	 * Once stop_this_handle() drops t_updates, the transaction could start
1946  	 * committing on us and eventually disappear.  So we must not
1947  	 * dereference transaction pointer again after calling
1948  	 * stop_this_handle().
1949  	 */
1950  	stop_this_handle(handle);
1951  
1952  	if (wait_for_commit)
1953  		err = jbd2_log_wait_commit(journal, tid);
1954  
1955  free_and_exit:
1956  	if (handle->h_rsv_handle)
1957  		jbd2_free_handle(handle->h_rsv_handle);
1958  	jbd2_free_handle(handle);
1959  	return err;
1960  }
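
/*
 * Example: an illustrative sketch of an fsync-style stop. Setting h_sync
 * before stopping makes jbd2_journal_stop() kick a commit, batch with other
 * sync writers as described above, and wait for the commit to complete.
 */
static int __maybe_unused demo_stop_synchronously(handle_t *handle)
{
	handle->h_sync = 1;		/* request a synchronous commit */
	return jbd2_journal_stop(handle);
}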
1961  
1962  /*
1963   *
1964   * List management code snippets: various functions for manipulating the
1965   * transaction buffer lists.
1966   *
1967   */
1968  
1969  /*
1970   * Append a buffer to a transaction list, given the transaction's list head
1971   * pointer.
1972   *
1973   * j_list_lock is held.
1974   *
1975   * jh->b_state_lock is held.
1976   */
1977  
1978  static inline void
1979  __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1980  {
1981  	if (!*list) {
1982  		jh->b_tnext = jh->b_tprev = jh;
1983  		*list = jh;
1984  	} else {
1985  		/* Insert at the tail of the list to preserve order */
1986  		struct journal_head *first = *list, *last = first->b_tprev;
1987  		jh->b_tprev = last;
1988  		jh->b_tnext = first;
1989  		last->b_tnext = first->b_tprev = jh;
1990  	}
1991  }
1992  
1993  /*
1994   * Remove a buffer from a transaction list, given the transaction's list
1995   * head pointer.
1996   *
1997   * Called with j_list_lock held, and the journal may not be locked.
1998   *
1999   * jh->b_state_lock is held.
2000   */
2001  
2002  static inline void
2003  __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
2004  {
2005  	if (*list == jh) {
2006  		*list = jh->b_tnext;
2007  		if (*list == jh)
2008  			*list = NULL;
2009  	}
2010  	jh->b_tprev->b_tnext = jh->b_tnext;
2011  	jh->b_tnext->b_tprev = jh->b_tprev;
2012  }
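
/*
 * Example: the same circular doubly-linked pattern as the two helpers
 * above, shown on a standalone (hypothetical) struct. An empty list is a
 * NULL head; a single element links to itself in both directions.
 */
struct demo_node {
	struct demo_node *next, *prev;
};

static void __maybe_unused demo_list_add_tail(struct demo_node **list,
					      struct demo_node *n)
{
	if (!*list) {
		n->next = n->prev = n;	/* first element: self-linked ring */
		*list = n;
	} else {
		struct demo_node *first = *list, *last = first->prev;

		n->prev = last;
		n->next = first;
		last->next = first->prev = n;	/* splice in before the head */
	}
}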
2013  
2014  /*
2015   * Remove a buffer from the appropriate transaction list.
2016   *
2017   * Note that this function can *change* the value of
2018   * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
2019   * t_reserved_list.  If the caller is holding onto a copy of one of these
2020   * pointers, it could go bad.  Generally the caller needs to re-read the
2021   * pointer from the transaction_t.
2022   *
2023   * Called under j_list_lock.
2024   */
2025  static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
2026  {
2027  	struct journal_head **list = NULL;
2028  	transaction_t *transaction;
2029  	struct buffer_head *bh = jh2bh(jh);
2030  
2031  	lockdep_assert_held(&jh->b_state_lock);
2032  	transaction = jh->b_transaction;
2033  	if (transaction)
2034  		assert_spin_locked(&transaction->t_journal->j_list_lock);
2035  
2036  	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
2037  	if (jh->b_jlist != BJ_None)
2038  		J_ASSERT_JH(jh, transaction != NULL);
2039  
2040  	switch (jh->b_jlist) {
2041  	case BJ_None:
2042  		return;
2043  	case BJ_Metadata:
2044  		transaction->t_nr_buffers--;
2045  		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
2046  		list = &transaction->t_buffers;
2047  		break;
2048  	case BJ_Forget:
2049  		list = &transaction->t_forget;
2050  		break;
2051  	case BJ_Shadow:
2052  		list = &transaction->t_shadow_list;
2053  		break;
2054  	case BJ_Reserved:
2055  		list = &transaction->t_reserved_list;
2056  		break;
2057  	}
2058  
2059  	__blist_del_buffer(list, jh);
2060  	jh->b_jlist = BJ_None;
2061  	if (transaction && is_journal_aborted(transaction->t_journal))
2062  		clear_buffer_jbddirty(bh);
2063  	else if (test_clear_buffer_jbddirty(bh))
2064  		mark_buffer_dirty(bh);	/* Expose it to the VM */
2065  }
2066  
2067  /*
2068   * Remove buffer from all transactions. The caller is responsible for dropping
2069   * the jh reference that belonged to the transaction.
2070   *
2071   * Called with bh_state lock and j_list_lock
2072   */
2073  static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
2074  {
2075  	J_ASSERT_JH(jh, jh->b_transaction != NULL);
2076  	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
2077  
2078  	__jbd2_journal_temp_unlink_buffer(jh);
2079  	jh->b_transaction = NULL;
2080  }
2081  
2082  void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
2083  {
2084  	struct buffer_head *bh = jh2bh(jh);
2085  
2086  	/* Get reference so that buffer cannot be freed before we unlock it */
2087  	get_bh(bh);
2088  	spin_lock(&jh->b_state_lock);
2089  	spin_lock(&journal->j_list_lock);
2090  	__jbd2_journal_unfile_buffer(jh);
2091  	spin_unlock(&journal->j_list_lock);
2092  	spin_unlock(&jh->b_state_lock);
2093  	jbd2_journal_put_journal_head(jh);
2094  	__brelse(bh);
2095  }
2096  
2097  /**
2098   * jbd2_journal_try_to_free_buffers() - try to free page buffers.
2099   * @journal: journal for operation
2100   * @folio: Folio to detach data from.
2101   *
2102   * For all the buffers on this folio,
2103   * if they are fully written out ordered data, move them onto BUF_CLEAN
2104   * so try_to_free_buffers() can reap them.
2105   *
2106   * This function returns true if we wish try_to_free_buffers()
2107   * to be called. We do this if the folio is releasable by try_to_free_buffers().
2108   * We also do it if the folio has locked or dirty buffers and the caller wants
2109   * us to perform sync or async writeout.
2110   *
2111   * This complicates JBD locking somewhat.  We aren't protected by the
2112   * BKL here.  We wish to remove the buffer from its committing or
2113   * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
2114   *
2115   * This may *change* the value of transaction_t->t_datalist, so anyone
2116   * who looks at t_datalist needs to lock against this function.
2117   *
2118   * Even worse, someone may be doing a jbd2_journal_dirty_data on this
2119   * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
2120   * will come out of the lock with the buffer dirty, which makes it
2121   * ineligible for release here.
2122   *
2123   * Who else is affected by this?  hmm...  Really the only contender
2124   * is do_get_write_access() - it could be looking at the buffer while
2125   * jbd2_journal_try_to_free_buffers() is changing its state.  But that
2126   * cannot happen because we never reallocate freed data as metadata
2127   * while the data is part of a transaction.  Yes?
2128   *
2129   * Return false on failure, true on success
2130   */
2131  bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
2132  {
2133  	struct buffer_head *head;
2134  	struct buffer_head *bh;
2135  	bool ret = false;
2136  
2137  	J_ASSERT(folio_test_locked(folio));
2138  
2139  	head = folio_buffers(folio);
2140  	bh = head;
2141  	do {
2142  		struct journal_head *jh;
2143  
2144  		/*
2145  		 * We take our own ref against the journal_head here to avoid
2146  		 * having to add tons of locking around each instance of
2147  		 * jbd2_journal_put_journal_head().
2148  		 */
2149  		jh = jbd2_journal_grab_journal_head(bh);
2150  		if (!jh)
2151  			continue;
2152  
2153  		spin_lock(&jh->b_state_lock);
2154  		if (!jh->b_transaction && !jh->b_next_transaction) {
2155  			spin_lock(&journal->j_list_lock);
2156  			/* Remove written-back checkpointed metadata buffer */
2157  			if (jh->b_cp_transaction != NULL)
2158  				jbd2_journal_try_remove_checkpoint(jh);
2159  			spin_unlock(&journal->j_list_lock);
2160  		}
2161  		spin_unlock(&jh->b_state_lock);
2162  		jbd2_journal_put_journal_head(jh);
2163  		if (buffer_jbd(bh))
2164  			goto busy;
2165  	} while ((bh = bh->b_this_page) != head);
2166  
2167  	ret = try_to_free_buffers(folio);
2168  busy:
2169  	return ret;
2170  }
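
/*
 * Example: an illustrative sketch of a ->release_folio method deferring to
 * the helper above, in the spirit of ext4_release_folio(). The demo_* name
 * and the way the journal is obtained are hypothetical.
 */
static bool __maybe_unused demo_release_folio(journal_t *journal,
					      struct folio *folio)
{
	if (folio_test_dirty(folio))
		return false;	/* dirty folios cannot be released */
	return jbd2_journal_try_to_free_buffers(journal, folio);
}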
2171  
2172  /*
2173   * This buffer is no longer needed.  If it is on an older transaction's
2174   * checkpoint list we need to record it on this transaction's forget list
2175   * to pin this buffer (and hence its checkpointing transaction) down until
2176   * this transaction commits.  If the buffer isn't on a checkpoint list, we
2177   * release it.
2178   * Returns non-zero if JBD no longer has an interest in the buffer.
2179   *
2180   * Called under j_list_lock.
2181   *
2182   * Called under jh->b_state_lock.
2183   */
2184  static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
2185  {
2186  	int may_free = 1;
2187  	struct buffer_head *bh = jh2bh(jh);
2188  
2189  	if (jh->b_cp_transaction) {
2190  		JBUFFER_TRACE(jh, "on running+cp transaction");
2191  		__jbd2_journal_temp_unlink_buffer(jh);
2192  		/*
2193  		 * We don't want to write the buffer anymore, clear the
2194  		 * bit so that we don't confuse checks in
2195  		 * __journal_file_buffer
2196  		 */
2197  		clear_buffer_dirty(bh);
2198  		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
2199  		may_free = 0;
2200  	} else {
2201  		JBUFFER_TRACE(jh, "on running transaction");
2202  		__jbd2_journal_unfile_buffer(jh);
2203  		jbd2_journal_put_journal_head(jh);
2204  	}
2205  	return may_free;
2206  }
2207  
2208  /*
2209   * jbd2_journal_invalidate_folio
2210   *
2211   * This code is tricky.  It has a number of cases to deal with.
2212   *
2213   * There are two invariants which this code relies on:
2214   *
2215   * i_size must be updated on disk before we start calling invalidate_folio
2216   * on the data.
2217   *
2218   *  This is done in ext4 by defining an ext4_setattr method which
2219   *  updates i_size before truncate gets going.  By maintaining this
2220   *  invariant, we can be sure that it is safe to throw away any buffers
2221   *  attached to the current transaction: once the transaction commits,
2222   *  we know that the data will not be needed.
2223   *
2224   *  Note however that we can *not* throw away data belonging to the
2225   *  previous, committing transaction!
2226   *
2227   * Any disk blocks which *are* part of the previous, committing
2228   * transaction (and which therefore cannot be discarded immediately) are
2229   * not going to be reused in the new running transaction.
2230   *
2231   *  The bitmap committed_data images guarantee this: any block which is
2232   *  allocated in one transaction and removed in the next will be marked
2233   *  as in-use in the committed_data bitmap, so cannot be reused until
2234   *  the next transaction to delete the block commits.  This means that
2235   *  leaving committing buffers dirty is quite safe: the disk blocks
2236   *  cannot be reallocated to a different file and so buffer aliasing is
2237   *  not possible.
2238   *
2239   *
2240   * The above applies mainly to ordered data mode.  In writeback mode we
2241   * don't make guarantees about the order in which data hits disk --- in
2242   * particular we don't guarantee that new dirty data is flushed before
2243   * transaction commit --- so it is always safe just to discard data
2244   * immediately in that mode.  --sct
2245   */
2246  
2247  /*
2248   * The journal_unmap_buffer helper function returns zero if the buffer
2249   * concerned remains pinned as an anonymous buffer belonging to an older
2250   * transaction.
2251   *
2252   * We're outside-transaction here.  Either or both of j_running_transaction
2253   * and j_committing_transaction may be NULL.
2254   */
2255  static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
2256  				int partial_page)
2257  {
2258  	transaction_t *transaction;
2259  	struct journal_head *jh;
2260  	int may_free = 1;
2261  
2262  	BUFFER_TRACE(bh, "entry");
2263  
2264  	/*
2265  	 * It is safe to proceed here without the j_list_lock because the
2266  	 * buffers cannot be stolen by try_to_free_buffers as long as we are
2267  	 * holding the page lock. --sct
2268  	 */
2269  
2270  	jh = jbd2_journal_grab_journal_head(bh);
2271  	if (!jh)
2272  		goto zap_buffer_unlocked;
2273  
2274  	/* OK, we have data buffer in journaled mode */
2275  	write_lock(&journal->j_state_lock);
2276  	spin_lock(&jh->b_state_lock);
2277  	spin_lock(&journal->j_list_lock);
2278  
2279  	/*
2280  	 * We cannot remove the buffer from checkpoint lists until the
2281  	 * transaction adding inode to orphan list (let's call it T)
2282  	 * is committed.  Otherwise if the transaction changing the
2283  	 * buffer would be cleaned from the journal before T is
2284  	 * committed, a crash would cause the correct contents of
2285  	 * the buffer to be lost.  On the other hand we have to
2286  	 * clear the buffer dirty bit no later than the moment when the
2287  	 * transaction marking the buffer as freed in the filesystem
2288  	 * structures is committed because from that moment on the
2289  	 * block can be reallocated and used by a different page.
2290  	 * Since the block hasn't been freed yet but the inode has
2291  	 * already been added to orphan list, it is safe for us to add
2292  	 * the buffer to BJ_Forget list of the newest transaction.
2293  	 *
2294  	 * Also we have to clear buffer_mapped flag of a truncated buffer
2295  	 * because the buffer_head may be attached to the page straddling
2296  	 * i_size (can happen only when blocksize < pagesize) and thus the
2297  	 * buffer_head can be reused when the file is extended again. So we end
2298  	 * up keeping around invalidated buffers attached to transactions'
2299  	 * BJ_Forget list just to stop checkpointing code from cleaning up
2300  	 * the transaction this buffer was modified in.
2301  	 */
2302  	transaction = jh->b_transaction;
2303  	if (transaction == NULL) {
2304  		/* First case: not on any transaction.  If it
2305  		 * has no checkpoint link, then we can zap it:
2306  		 * it's a writeback-mode buffer so we don't care
2307  		 * if it hits disk safely. */
2308  		if (!jh->b_cp_transaction) {
2309  			JBUFFER_TRACE(jh, "not on any transaction: zap");
2310  			goto zap_buffer;
2311  		}
2312  
2313  		if (!buffer_dirty(bh)) {
2314  			/* bdflush has written it.  We can drop it now */
2315  			__jbd2_journal_remove_checkpoint(jh);
2316  			goto zap_buffer;
2317  		}
2318  
2319  		/* OK, it must be in the journal but still not
2320  		 * written fully to disk: it's metadata or
2321  		 * journaled data... */
2322  
2323  		if (journal->j_running_transaction) {
2324  			/* ... and once the current transaction has
2325  			 * committed, the buffer won't be needed any
2326  			 * longer. */
2327  			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
2328  			may_free = __dispose_buffer(jh,
2329  					journal->j_running_transaction);
2330  			goto zap_buffer;
2331  		} else {
2332  			/* There is no currently-running transaction. So the
2333  			 * orphan record which we wrote for this file must have
2334  			 * passed into commit.  We must attach this buffer to
2335  			 * the committing transaction, if it exists. */
2336  			if (journal->j_committing_transaction) {
2337  				JBUFFER_TRACE(jh, "give to committing trans");
2338  				may_free = __dispose_buffer(jh,
2339  					journal->j_committing_transaction);
2340  				goto zap_buffer;
2341  			} else {
2342  				/* The orphan record's transaction has
2343  				 * committed.  We can cleanse this buffer */
2344  				clear_buffer_jbddirty(bh);
2345  				__jbd2_journal_remove_checkpoint(jh);
2346  				goto zap_buffer;
2347  			}
2348  		}
2349  	} else if (transaction == journal->j_committing_transaction) {
2350  		JBUFFER_TRACE(jh, "on committing transaction");
2351  		/*
2352  		 * The buffer is committing, we simply cannot touch
2353  		 * it. If the page is straddling i_size we have to wait
2354  		 * for commit and try again.
2355  		 */
2356  		if (partial_page) {
2357  			spin_unlock(&journal->j_list_lock);
2358  			spin_unlock(&jh->b_state_lock);
2359  			write_unlock(&journal->j_state_lock);
2360  			jbd2_journal_put_journal_head(jh);
2361  			/* Already zapped buffer? Nothing to do... */
2362  			if (!bh->b_bdev)
2363  				return 0;
2364  			return -EBUSY;
2365  		}
2366  		/*
2367  		 * OK, buffer won't be reachable after truncate. We just clear
2368  		 * b_modified to not confuse transaction credit accounting, and
2369  		 * set b_next_transaction to the running transaction (if there
2370  		 * is one) and mark buffer as freed so that commit code knows
2371  		 * it should clear dirty bits when it is done with the buffer.
2372  		 */
2373  		set_buffer_freed(bh);
2374  		if (journal->j_running_transaction && buffer_jbddirty(bh))
2375  			jh->b_next_transaction = journal->j_running_transaction;
2376  		jh->b_modified = 0;
2377  		spin_unlock(&journal->j_list_lock);
2378  		spin_unlock(&jh->b_state_lock);
2379  		write_unlock(&journal->j_state_lock);
2380  		jbd2_journal_put_journal_head(jh);
2381  		return 0;
2382  	} else {
2383  		/* Good, the buffer belongs to the running transaction.
2384  		 * We are writing our own transaction's data, not any
2385  		 * previous one's, so it is safe to throw it away
2386  		 * (remember that we expect the filesystem to have set
2387  		 * i_size already for this truncate so recovery will not
2388  		 * expose the disk blocks we are discarding here.) */
2389  		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
2390  		JBUFFER_TRACE(jh, "on running transaction");
2391  		may_free = __dispose_buffer(jh, transaction);
2392  	}
2393  
2394  zap_buffer:
2395  	/*
2396  	 * This is tricky. Although the buffer is truncated, it may be reused
2397  	 * if blocksize < pagesize and it is attached to the page straddling
2398  	 * EOF. Since the buffer might have been added to BJ_Forget list of the
2399  	 * running transaction, journal_get_write_access() won't clear
2400  	 * b_modified and credit accounting gets confused. So clear b_modified
2401  	 * here.
2402  	 */
2403  	jh->b_modified = 0;
2404  	spin_unlock(&journal->j_list_lock);
2405  	spin_unlock(&jh->b_state_lock);
2406  	write_unlock(&journal->j_state_lock);
2407  	jbd2_journal_put_journal_head(jh);
2408  zap_buffer_unlocked:
2409  	clear_buffer_dirty(bh);
2410  	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
2411  	clear_buffer_mapped(bh);
2412  	clear_buffer_req(bh);
2413  	clear_buffer_new(bh);
2414  	clear_buffer_delay(bh);
2415  	clear_buffer_unwritten(bh);
2416  	bh->b_bdev = NULL;
2417  	return may_free;
2418  }
2419  
2420  /**
2421   * jbd2_journal_invalidate_folio()
2422   * @journal: journal to use for flush...
2423   * @folio:    folio to flush
2424   * @offset:  start of the range to invalidate
2425   * @length:  length of the range to invalidate
2426   *
2427   * Reap buffers containing data in the specified range of the folio.
2428   * Can return -EBUSY if buffers are part of the committing transaction and
2429   * the folio is straddling i_size. Caller then has to wait for current commit
2430   * and try again.
2431   */
2432  int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio,
2433  				size_t offset, size_t length)
2434  {
2435  	struct buffer_head *head, *bh, *next;
2436  	unsigned int stop = offset + length;
2437  	unsigned int curr_off = 0;
2438  	int partial_page = (offset || length < folio_size(folio));
2439  	int may_free = 1;
2440  	int ret = 0;
2441  
2442  	if (!folio_test_locked(folio))
2443  		BUG();
2444  	head = folio_buffers(folio);
2445  	if (!head)
2446  		return 0;
2447  
2448  	BUG_ON(stop > folio_size(folio) || stop < length);
2449  
2450  	/* We will potentially be playing with lists other than just the
2451  	 * data lists (especially for journaled data mode), so be
2452  	 * cautious in our locking. */
2453  
2454  	bh = head;
2455  	do {
2456  		unsigned int next_off = curr_off + bh->b_size;
2457  		next = bh->b_this_page;
2458  
2459  		if (next_off > stop)
2460  			return 0;
2461  
2462  		if (offset <= curr_off) {
2463  			/* This block is wholly outside the truncation point */
2464  			lock_buffer(bh);
2465  			ret = journal_unmap_buffer(journal, bh, partial_page);
2466  			unlock_buffer(bh);
2467  			if (ret < 0)
2468  				return ret;
2469  			may_free &= ret;
2470  		}
2471  		curr_off = next_off;
2472  		bh = next;
2473  
2474  	} while (bh != head);
2475  
2476  	if (!partial_page) {
2477  		if (may_free && try_to_free_buffers(folio))
2478  			J_ASSERT(!folio_buffers(folio));
2479  	}
2480  	return 0;
2481  }
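
/*
 * Example: an illustrative sketch of an ->invalidate_folio implementation
 * built on the helper above, in the spirit of ext4's journalled data mode.
 * Handling of the -EBUSY retry (wait for the commit, then call again) is
 * left to the caller and only noted here.
 */
static int __maybe_unused demo_invalidate_folio(journal_t *journal,
						struct folio *folio,
						size_t offset, size_t length)
{
	int ret = jbd2_journal_invalidate_folio(journal, folio, offset, length);

	/* ret == -EBUSY: committing buffers on a folio straddling i_size. */
	return ret;
}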
2482  
2483  /*
2484   * File a buffer on the given transaction list.
2485   */
2486  void __jbd2_journal_file_buffer(struct journal_head *jh,
2487  			transaction_t *transaction, int jlist)
2488  {
2489  	struct journal_head **list = NULL;
2490  	int was_dirty = 0;
2491  	struct buffer_head *bh = jh2bh(jh);
2492  
2493  	lockdep_assert_held(&jh->b_state_lock);
2494  	assert_spin_locked(&transaction->t_journal->j_list_lock);
2495  
2496  	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
2497  	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
2498  				jh->b_transaction == NULL);
2499  
2500  	if (jh->b_transaction && jh->b_jlist == jlist)
2501  		return;
2502  
2503  	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
2504  	    jlist == BJ_Shadow || jlist == BJ_Forget) {
2505  		/*
2506  		 * For metadata buffers, we track dirty bit in buffer_jbddirty
2507  		 * instead of buffer_dirty. We should not see a dirty bit set
2508  		 * here because we clear it in do_get_write_access but e.g.
2509  		 * tune2fs can modify the sb and set the dirty bit at any time
2510  		 * so we try to gracefully handle that.
2511  		 */
2512  		if (buffer_dirty(bh))
2513  			warn_dirty_buffer(bh);
2514  		if (test_clear_buffer_dirty(bh) ||
2515  		    test_clear_buffer_jbddirty(bh))
2516  			was_dirty = 1;
2517  	}
2518  
2519  	if (jh->b_transaction)
2520  		__jbd2_journal_temp_unlink_buffer(jh);
2521  	else
2522  		jbd2_journal_grab_journal_head(bh);
2523  	jh->b_transaction = transaction;
2524  
2525  	switch (jlist) {
2526  	case BJ_None:
2527  		J_ASSERT_JH(jh, !jh->b_committed_data);
2528  		J_ASSERT_JH(jh, !jh->b_frozen_data);
2529  		return;
2530  	case BJ_Metadata:
2531  		transaction->t_nr_buffers++;
2532  		list = &transaction->t_buffers;
2533  		break;
2534  	case BJ_Forget:
2535  		list = &transaction->t_forget;
2536  		break;
2537  	case BJ_Shadow:
2538  		list = &transaction->t_shadow_list;
2539  		break;
2540  	case BJ_Reserved:
2541  		list = &transaction->t_reserved_list;
2542  		break;
2543  	}
2544  
2545  	__blist_add_buffer(list, jh);
2546  	jh->b_jlist = jlist;
2547  
2548  	if (was_dirty)
2549  		set_buffer_jbddirty(bh);
2550  }
2551  
2552  void jbd2_journal_file_buffer(struct journal_head *jh,
2553  				transaction_t *transaction, int jlist)
2554  {
2555  	spin_lock(&jh->b_state_lock);
2556  	spin_lock(&transaction->t_journal->j_list_lock);
2557  	__jbd2_journal_file_buffer(jh, transaction, jlist);
2558  	spin_unlock(&transaction->t_journal->j_list_lock);
2559  	spin_unlock(&jh->b_state_lock);
2560  }
2561  
2562  /*
2563   * Remove a buffer from its current buffer list in preparation for
2564   * dropping it from its current transaction entirely.  If the buffer has
2565   * already started to be used by a subsequent transaction, refile the
2566   * buffer on that transaction's metadata list.
2567   *
2568   * Called under j_list_lock
2569   * Called under jh->b_state_lock
2570   *
2571   * When this function returns true, there's no next transaction to refile to
2572   * and the caller has to drop jh reference through
2573   * jbd2_journal_put_journal_head().
2574   */
2575  bool __jbd2_journal_refile_buffer(struct journal_head *jh)
2576  {
2577  	int was_dirty, jlist;
2578  	struct buffer_head *bh = jh2bh(jh);
2579  
2580  	lockdep_assert_held(&jh->b_state_lock);
2581  	if (jh->b_transaction)
2582  		assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
2583  
2584  	/* If the buffer is now unused, just drop it. */
2585  	if (jh->b_next_transaction == NULL) {
2586  		__jbd2_journal_unfile_buffer(jh);
2587  		return true;
2588  	}
2589  
2590  	/*
2591  	 * It has been modified by a later transaction: add it to the new
2592  	 * transaction's metadata list.
2593  	 */
2594  
2595  	was_dirty = test_clear_buffer_jbddirty(bh);
2596  	__jbd2_journal_temp_unlink_buffer(jh);
2597  
2598  	/*
2599  	 * b_transaction must be set, otherwise the new b_transaction won't
2600  	 * be holding the jh reference.
2601  	 */
2602  	J_ASSERT_JH(jh, jh->b_transaction != NULL);
2603  
2604  	/*
2605  	 * We set b_transaction here because b_next_transaction will inherit
2606  	 * our jh reference and thus __jbd2_journal_file_buffer() must not
2607  	 * take a new one.
2608  	 */
2609  	WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
2610  	WRITE_ONCE(jh->b_next_transaction, NULL);
2611  	if (buffer_freed(bh))
2612  		jlist = BJ_Forget;
2613  	else if (jh->b_modified)
2614  		jlist = BJ_Metadata;
2615  	else
2616  		jlist = BJ_Reserved;
2617  	__jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
2618  	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2619  
2620  	if (was_dirty)
2621  		set_buffer_jbddirty(bh);
2622  	return false;
2623  }
2624  
2625  /*
2626   * __jbd2_journal_refile_buffer() with necessary locking added. We take our
2627   * bh reference so that we can safely unlock bh.
2628   *
2629   * The jh and bh may be freed by this call.
2630   */
2631  void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2632  {
2633  	bool drop;
2634  
2635  	spin_lock(&jh->b_state_lock);
2636  	spin_lock(&journal->j_list_lock);
2637  	drop = __jbd2_journal_refile_buffer(jh);
2638  	spin_unlock(&jh->b_state_lock);
2639  	spin_unlock(&journal->j_list_lock);
2640  	if (drop)
2641  		jbd2_journal_put_journal_head(jh);
2642  }
2643  
2644  /*
2645   * File inode in the inode list of the handle's transaction
2646   */
2647  static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
2648  		unsigned long flags, loff_t start_byte, loff_t end_byte)
2649  {
2650  	transaction_t *transaction = handle->h_transaction;
2651  	journal_t *journal;
2652  
2653  	if (is_handle_aborted(handle))
2654  		return -EROFS;
2655  	journal = transaction->t_journal;
2656  
2657  	jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
2658  			transaction->t_tid);
2659  
2660  	spin_lock(&journal->j_list_lock);
2661  	jinode->i_flags |= flags;
2662  
2663  	if (jinode->i_dirty_end) {
2664  		jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte);
2665  		jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte);
2666  	} else {
2667  		jinode->i_dirty_start = start_byte;
2668  		jinode->i_dirty_end = end_byte;
2669  	}
2670  
2671  	/* Is inode already attached where we need it? */
2672  	if (jinode->i_transaction == transaction ||
2673  	    jinode->i_next_transaction == transaction)
2674  		goto done;
2675  
2676  	/*
2677  	 * We only ever set this variable to 1 so the test is safe. Since
2678  	 * t_need_data_flush is likely to be set, we do the test to save some
2679  	 * cacheline bouncing.
2680  	 */
2681  	if (!transaction->t_need_data_flush)
2682  		transaction->t_need_data_flush = 1;
2683  	/* On some different transaction's list - should be
2684  	 * the committing one */
2685  	if (jinode->i_transaction) {
2686  		J_ASSERT(jinode->i_next_transaction == NULL);
2687  		J_ASSERT(jinode->i_transaction ==
2688  					journal->j_committing_transaction);
2689  		jinode->i_next_transaction = transaction;
2690  		goto done;
2691  	}
2692  	/* Not on any transaction list... */
2693  	J_ASSERT(!jinode->i_next_transaction);
2694  	jinode->i_transaction = transaction;
2695  	list_add(&jinode->i_list, &transaction->t_inode_list);
2696  done:
2697  	spin_unlock(&journal->j_list_lock);
2698  
2699  	return 0;
2700  }
2701  
2702  int jbd2_journal_inode_ranged_write(handle_t *handle,
2703  		struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
2704  {
2705  	return jbd2_journal_file_inode(handle, jinode,
2706  			JI_WRITE_DATA | JI_WAIT_DATA, start_byte,
2707  			start_byte + length - 1);
2708  }
2709  
2710  int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode,
2711  		loff_t start_byte, loff_t length)
2712  {
2713  	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA,
2714  			start_byte, start_byte + length - 1);
2715  }
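
/*
 * Example: an illustrative sketch of an ordered-mode write path telling the
 * journal which byte range must reach disk before commit, in the spirit of
 * ext4_jbd2_inode_add_write(). The jinode plumbing is hypothetical.
 */
static int __maybe_unused demo_ordered_write(handle_t *handle,
					     struct jbd2_inode *jinode,
					     loff_t pos, loff_t len)
{
	/* Data in [pos, pos + len) is written and waited on at commit time. */
	return jbd2_journal_inode_ranged_write(handle, jinode, pos, len);
}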
2716  
2717  /*
2718   * File truncate and transaction commit interact with each other in a
2719   * non-trivial way.  If a transaction writing data block A is
2720   * committing, we cannot discard the data by truncate until we have
2721   * written them.  Otherwise if we crashed after the transaction with
2722   * write has committed but before the transaction with truncate has
2723   * committed, we could see stale data in block A.  This function is a
2724   * helper to solve this problem.  It starts writeout of the truncated
2725   * part in case it is in the committing transaction.
2726   *
2727   * Filesystem code must call this function when inode is journaled in
2728   * ordered mode before truncation happens and after the inode has been
2729   * placed on orphan list with the new inode size. The second condition
2730   * avoids the race that someone writes new data and we start
2731   * committing the transaction after this function has been called but
2732   * before a transaction for truncate is started (and furthermore it
2733   * allows us to optimize the case where the addition to orphan list
2734   * happens in the same transaction as write --- we don't have to write
2735   * any data in such case).
2736   */
2737  int jbd2_journal_begin_ordered_truncate(journal_t *journal,
2738  					struct jbd2_inode *jinode,
2739  					loff_t new_size)
2740  {
2741  	transaction_t *inode_trans, *commit_trans;
2742  	int ret = 0;
2743  
2744  	/* This is a quick check to avoid locking if not necessary */
2745  	if (!jinode->i_transaction)
2746  		goto out;
2747  	/* Locks are here just to force reading of recent values; it is
2748  	 * enough that the transaction was not committing before we started
2749  	 * a transaction adding the inode to the orphan list. */
2750  	read_lock(&journal->j_state_lock);
2751  	commit_trans = journal->j_committing_transaction;
2752  	read_unlock(&journal->j_state_lock);
2753  	spin_lock(&journal->j_list_lock);
2754  	inode_trans = jinode->i_transaction;
2755  	spin_unlock(&journal->j_list_lock);
2756  	if (inode_trans == commit_trans) {
2757  		ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
2758  			new_size, LLONG_MAX);
2759  		if (ret)
2760  			jbd2_journal_abort(journal, ret);
2761  	}
2762  out:
2763  	return ret;
2764  }
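
/*
 * Example: an illustrative sketch of the call order described above for
 * truncating an ordered-mode inode. The orphan handling and the block
 * freeing are the filesystem's own code and only sketched in comments.
 */
static int __maybe_unused demo_ordered_truncate(journal_t *journal,
						struct jbd2_inode *jinode,
						loff_t new_size)
{
	/* 1) The filesystem puts the inode on the orphan list with new_size. */
	/* 2) Flush the truncated range if the committing transaction owns it: */
	return jbd2_journal_begin_ordered_truncate(journal, jinode, new_size);
	/* 3) The filesystem then starts a handle and frees the tail blocks.  */
}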
2765