// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs
 * (see struct ext4_fc_tl; a sketch of the on-disk header follows the tag
 * list below). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed and instead derived during
 *				  replay.
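 *
 * Each TLV starts with a fixed header that carries the tag and the length
 * of the value that follows it, both stored as little-endian 16-bit values
 * (as the cpu_to_le16() stores below confirm). As a sketch; see the
 * definition of struct ext4_fc_tl for the canonical layout:
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;	// one of the EXT4_FC_TAG_* values above
 *		__le16 fc_len;	// number of value bytes that follow
 *	};
 *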
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures atomicity; please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All inode updates must call ext4_fc_start_update() before starting an
 * update. If a fast commit is in progress on the inode, that call blocks
 * until it completes. The completion of an update is marked by
 * ext4_fc_stop_update().
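 *
 * As a sketch, an update path brackets its changes like this (illustrative
 * only; the real call sites are ext4's VFS entry points):
 *
 *	ext4_fc_start_update(inode);	// blocks while the inode is COMMITTING
 *	... perform the inode update ...
 *	ext4_fc_stop_update(inode);	// wakes up a waiting fast commit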
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * that while in replay, we crash at (z). During the second replay, file A
 * (which was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, a file named A would be absent when we try to read A. So,
 * this sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure fast commits store the outcome of each
 * procedure. Thus the fast commit log for above procedure would be as follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
 * handled similarly. Thus, by converting a non-idempotent procedure into a
 * series of idempotent outcomes, fast commits ensure idempotence during replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay, after
 *    trying to do recovery again, we will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found). In
 *    order to deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag and
 *    perform fast commit recovery even if that area is invalidated by later
 *    full commits.
 *
 * 1) Fast commit's commit path locks the entire file system during fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use the ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	INIT_LIST_HEAD(&ei->i_fc_dilist);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

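	/*
	 * On 32-bit systems the EXT4_STATE_* bits have their own
	 * i_state_flags word, while on 64-bit systems they are folded into
	 * the upper bits of i_flags (see ext4.h), so wait on whichever word
	 * actually holds EXT4_STATE_FC_COMMITTING.
	 */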
#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

static bool ext4_fc_disabled(struct super_block *sb)
{
	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
}

/*
 * Inform ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high-level VFS callbacks before performing
 * any inode update. It blocks if there's an ongoing fast commit on the inode
 * in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop an inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until the inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_fc_dentry_update *fc_dentry;

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}

	if (!list_empty(&ei->i_fc_list))
		list_del_init(&ei->i_fc_list);

	/*
	 * Since this inode is getting removed, let's also remove all FC
	 * dentry create references, since there is no need to log them
	 * anyway.
	 */
	if (list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&sbi->s_fc_lock);
		return;
	}

	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
	list_del_init(&fc_dentry->fcd_list);
	list_del_init(&fc_dentry->fcd_dilist);

	WARN_ON(!list_empty(&ei->i_fc_dilist));
	spin_unlock(&sbi->s_fc_lock);

	if (fc_dentry->fcd_name.name &&
		fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
		kfree(fc_dentry->fcd_name.name);
	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);

	return;
}

/*
 * Mark the file system as fast commit ineligible, and record the latest
 * ineligible transaction tid. This means that, up to the recorded
 * transaction, any commit operation will result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	tid_t tid;
	bool has_transaction = true;
	bool is_ineligible;

	if (ext4_fc_disabled(sb))
		return;

	if (handle && !IS_ERR(handle))
		tid = handle->h_transaction->t_tid;
	else {
		read_lock(&sbi->s_journal->j_state_lock);
		if (sbi->s_journal->j_running_transaction)
			tid = sbi->s_journal->j_running_transaction->t_tid;
		else
			has_transaction = false;
		read_unlock(&sbi->s_journal->j_state_lock);
	}
	spin_lock(&sbi->s_fc_lock);
	is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	if (has_transaction &&
	    (!is_ineligible ||
	     (is_ineligible && tid_gt(tid, sbi->s_fc_ineligible_tid))))
		sbi->s_fc_ineligible_tid = tid;
	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	spin_unlock(&sbi->s_fc_lock);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(handle, inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(handle_t *handle, struct inode *inode,
				 void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct inode *dir = dentry->d_parent->d_inode;
	struct super_block *sb = inode->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	mutex_unlock(&ei->i_fc_lock);

	if (IS_ENCRYPTED(dir)) {
		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
					handle);
		mutex_lock(&ei->i_fc_lock);
		return -EOPNOTSUPP;
	}

	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dir->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;
	INIT_LIST_HEAD(&node->fcd_dilist);
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);

	/*
	 * This helps us keep track of all fc_dentry updates that are part of
	 * this ext4 inode. If the inode is unlinked before we even get a
	 * chance to fsync, we can remove all fc_dentry references while
	 * evicting the inode in ext4_fc_del(). This also means we don't need
	 * to loop over all the inodes in sbi->s_fc_q to find the
	 * corresponding inode in ext4_fc_commit_dentry_updates().
	 */
	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
		WARN_ON(!list_empty(&ei->i_fc_dilist));
		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
	}
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_unlink(handle, inode, dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(handle, inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_link(handle, inode, dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(handle, inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_create(handle, inode, dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
			 bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
		return;
	}

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(handle, inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(handle_t *handle, struct inode *inode, void *arg,
			 bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	if (ext4_has_inline_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
					handle);
		return;
	}

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(handle, inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	blk_opf_t write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* Add REQ_FUA | REQ_PREFLUSH only if this is the tail */
	if (test_opt(sb, BARRIER) && is_tail)
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE | write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * At commit time this function manages the fast commit block space. We never
 * split a fast commit log record across blocks, so if there is not enough
 * space on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case a new block
 * is requested from jbd2 and the CRC is updated to reflect the padding we
 * added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int remaining;
	u8 *dst;

	/*
	 * If 'len' is too long to fit in any block alongside a PAD tlv, then we
	 * cannot fulfill the request.
	 */
	if (len > bsize - EXT4_FC_TAG_BASE_LEN)
		return NULL;

	if (!sbi->s_fc_bh) {
		ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
		if (ret)
			return NULL;
		sbi->s_fc_bh = bh;
	}
	dst = sbi->s_fc_bh->b_data + off;

	/*
	 * Allocate the bytes in the current block if we can do so while still
	 * leaving enough space for a PAD tlv.
	 */
	remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
	if (len <= remaining) {
		sbi->s_fc_bytes += len;
		return dst;
	}

	/*
	 * Else, terminate the current block with a PAD tlv, then allocate a new
	 * block and allocate the bytes at the start of that new block.
	 */

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	tl.fc_len = cpu_to_le16(remaining);
	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
	*crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);

	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
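	/*
	 * Advance s_fc_bytes past the padded remainder of the old block
	 * (bsize - off) plus the new allocation, i.e. round it up to the
	 * block boundary and then add len.
	 */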
	sbi->s_fc_bytes += bsize - off + len;
	return sbi->s_fc_bh->b_data;
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, the next commit must not use any space remaining in the block
 * after the tail tag, even if some is left. That's why the tail tag's length
 * covers all of the remaining space in the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate the tail.
	 */
	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
	dst += sizeof(tail.fc_tid);
	crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
			  dst - (u8 *)sbi->s_fc_bh->b_data);
	tail.fc_crc = cpu_to_le32(crc);
	memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
	dst += sizeof(tail.fc_crc);
	memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */

	ext4_fc_submit_bh(sb, true);

	return 0;
}

/*
 * Adds tag, length and value, and updates the CRC. Returns true if the tlv
 * was added. Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			   u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);

	return true;
}

/* Same as above, but adds a dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
				   struct ext4_fc_dentry_update *fc_dentry)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	int dlen = fc_dentry->fcd_name.len;
	u8 *dst = ext4_fc_reserve_space(sb,
			EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	memcpy(dst, &fcd, sizeof(fcd));
	dst += sizeof(fcd);
	memcpy(dst, fc_dentry->fcd_name.name, dlen);

	return true;
}

/*
 * Writes the inode to the fast commit space as an EXT4_FC_TAG_INODE tlv.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		inode_len = EXT4_INODE_SIZE(inode->i_sb);
	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	ret = -ECANCELED;
	dst = ext4_fc_reserve_space(inode->i_sb,
		EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		goto err;

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	memcpy(dst, &fc_inode, sizeof(fc_inode));
	dst += sizeof(fc_inode);
	memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
	ret = 0;
err:
	brelse(iloc.bh);
	return ret;
}

/*
 * Writes updated data ranges for the inode in question. Updates the CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	ext4_debug("will try writing %d to %d for inode %ld\n",
		   cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(journal, ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}
		/*
		 * With fcd_dilist we don't need to loop over sbi->s_fc_q to
		 * get the corresponding inode pointer.
		 */
		WARN_ON(list_empty(&fc_dentry->fcd_dilist));
		ei = list_first_entry(&fc_dentry->fcd_dilist,
				struct ext4_inode_info, i_fc_dilist);
		inode = &ei->vfs_inode;
		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);

		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. That lets us use
		 * the namei.c routines almost as is and simplifies the
		 * recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If the file system device is different from the journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

static void ext4_fc_update_stats(struct super_block *sb, int status,
				 u64 commit_time, int nblks, tid_t commit_tid)
{
	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

	ext4_debug("Fast commit ended with status = %d for tid %u",
			status, commit_tid);
	if (status == EXT4_FC_STATUS_OK) {
		stats->fc_num_commits++;
		stats->fc_numblks += nblks;
		if (likely(stats->s_fc_avg_commit_time))
			stats->s_fc_avg_commit_time =
				(commit_time +
				 stats->s_fc_avg_commit_time * 3) / 4;
		else
			stats->s_fc_avg_commit_time = commit_time;
	} else if (status == EXT4_FC_STATUS_FAILED ||
		   status == EXT4_FC_STATUS_INELIGIBLE) {
		if (status == EXT4_FC_STATUS_FAILED)
			stats->fc_failed_commits++;
		stats->fc_ineligible_commits++;
	} else {
		stats->fc_skipped_commits++;
	}
	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to a full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return jbd2_complete_transaction(journal, commit_tid);

	trace_ext4_fc_commit_start(sb, commit_tid);

	start_time = ktime_get();

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
		    tid_gt(commit_tid, journal->j_commit_sequence))
			goto restart_fc;
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
				commit_tid);
		return 0;
	} else if (ret) {
		/*
		 * Commit couldn't start. Just update stats and perform a
		 * full commit.
		 */
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
				commit_tid);
		return jbd2_complete_transaction(journal, commit_tid);
	}

	/*
	 * After establishing the journal barrier via jbd2_fc_begin_commit(),
	 * check if we are fast commit ineligible.
	 */
	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
		status = EXT4_FC_STATUS_INELIGIBLE;
		goto fallback;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	atomic_inc(&sbi->s_fc_subtid);
	ret = jbd2_fc_end_commit(journal);
	/*
	 * weight the commit time higher than the average time so we
	 * don't react too strongly to vast changes in the commit time
	 */
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
	return ret;

fallback:
	ret = jbd2_fc_end_commit_fallback(journal);
	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
	return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	trace_ext4_fc_cleanup(journal, full, tid);
	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		if (tid_geq(tid, iter->i_sync_tid)) {
			ext4_fc_reset_inode(&iter->vfs_inode);
		} else if (full) {
			/*
			 * We are called after a full commit, and the inode
			 * has been modified while the commit was running.
			 * Re-enqueue the inode into STAGING, which will then
			 * be spliced back into MAIN. This cannot happen
			 * during a fast commit because the journal is locked
			 * all the time in that case (and the tid doesn't
			 * increase so the tid check above isn't reliable).
			 */
			list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list,
				      &sbi->s_fc_q[FC_Q_STAGING]);
		}
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		list_del_init(&fc_dentry->fcd_dilist);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
		sbi->s_fc_ineligible_tid = 0;
		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	}

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}
1355  
1356  /* Ext4 Replay Path Routines */
1357  
1358  /* Helper struct for dentry replay routines */
1359  struct dentry_info_args {
1360  	int parent_ino, dname_len, ino, inode_len;
1361  	char *dname;
1362  };
1363  
1364  /* Same as struct ext4_fc_tl, but uses native endianness fields */
1365  struct ext4_fc_tl_mem {
1366  	u16 fc_tag;
1367  	u16 fc_len;
1368  };
1369  
tl_to_darg(struct dentry_info_args * darg,struct ext4_fc_tl_mem * tl,u8 * val)1370  static inline void tl_to_darg(struct dentry_info_args *darg,
1371  			      struct ext4_fc_tl_mem *tl, u8 *val)
1372  {
1373  	struct ext4_fc_dentry_info fcd;
1374  
1375  	memcpy(&fcd, val, sizeof(fcd));
1376  
1377  	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1378  	darg->ino = le32_to_cpu(fcd.fc_ino);
1379  	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1380  	darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
1381  }
1382  
ext4_fc_get_tl(struct ext4_fc_tl_mem * tl,u8 * val)1383  static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
1384  {
1385  	struct ext4_fc_tl tl_disk;
1386  
1387  	memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN);
1388  	tl->fc_len = le16_to_cpu(tl_disk.fc_len);
1389  	tl->fc_tag = le16_to_cpu(tl_disk.fc_tag);
1390  }
1391  
1392  /* Unlink replay function */
ext4_fc_replay_unlink(struct super_block * sb,struct ext4_fc_tl_mem * tl,u8 * val)1393  static int ext4_fc_replay_unlink(struct super_block *sb,
1394  				 struct ext4_fc_tl_mem *tl, u8 *val)
1395  {
1396  	struct inode *inode, *old_parent;
1397  	struct qstr entry;
1398  	struct dentry_info_args darg;
1399  	int ret = 0;
1400  
1401  	tl_to_darg(&darg, tl, val);
1402  
1403  	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1404  			darg.parent_ino, darg.dname_len);
1405  
1406  	entry.name = darg.dname;
1407  	entry.len = darg.dname_len;
1408  	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1409  
1410  	if (IS_ERR(inode)) {
1411  		ext4_debug("Inode %d not found", darg.ino);
1412  		return 0;
1413  	}
1414  
1415  	old_parent = ext4_iget(sb, darg.parent_ino,
1416  				EXT4_IGET_NORMAL);
1417  	if (IS_ERR(old_parent)) {
1418  		ext4_debug("Dir with inode %d not found", darg.parent_ino);
1419  		iput(inode);
1420  		return 0;
1421  	}
1422  
1423  	ret = __ext4_unlink(old_parent, &entry, inode, NULL);
1424  	/* -ENOENT ok coz it might not exist anymore. */
1425  	if (ret == -ENOENT)
1426  		ret = 0;
1427  	iput(old_parent);
1428  	iput(inode);
1429  	return ret;
1430  }
1431  
ext4_fc_replay_link_internal(struct super_block * sb,struct dentry_info_args * darg,struct inode * inode)1432  static int ext4_fc_replay_link_internal(struct super_block *sb,
1433  				struct dentry_info_args *darg,
1434  				struct inode *inode)
1435  {
1436  	struct inode *dir = NULL;
1437  	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1438  	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1439  	int ret = 0;
1440  
1441  	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1442  	if (IS_ERR(dir)) {
1443  		ext4_debug("Dir with inode %d not found.", darg->parent_ino);
1444  		dir = NULL;
1445  		goto out;
1446  	}
1447  
1448  	dentry_dir = d_obtain_alias(dir);
1449  	if (IS_ERR(dentry_dir)) {
1450  		ext4_debug("Failed to obtain dentry");
1451  		dentry_dir = NULL;
1452  		goto out;
1453  	}
1454  
1455  	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1456  	if (!dentry_inode) {
1457  		ext4_debug("Inode dentry not created.");
1458  		ret = -ENOMEM;
1459  		goto out;
1460  	}
1461  
1462  	ret = __ext4_link(dir, inode, dentry_inode);
1463  	/*
1464  	 * It's possible that link already existed since data blocks
1465  	 * for the dir in question got persisted before we crashed OR
1466  	 * we replayed this tag and crashed before the entire replay
1467  	 * could complete.
1468  	 */
1469  	if (ret && ret != -EEXIST) {
1470  		ext4_debug("Failed to link\n");
1471  		goto out;
1472  	}
1473  
1474  	ret = 0;
1475  out:
1476  	if (dentry_dir) {
1477  		d_drop(dentry_dir);
1478  		dput(dentry_dir);
1479  	} else if (dir) {
1480  		iput(dir);
1481  	}
1482  	if (dentry_inode) {
1483  		d_drop(dentry_inode);
1484  		dput(dentry_inode);
1485  	}
1486  
1487  	return ret;
1488  }
1489  
1490  /* Link replay function */
ext4_fc_replay_link(struct super_block * sb,struct ext4_fc_tl_mem * tl,u8 * val)1491  static int ext4_fc_replay_link(struct super_block *sb,
1492  			       struct ext4_fc_tl_mem *tl, u8 *val)
1493  {
1494  	struct inode *inode;
1495  	struct dentry_info_args darg;
1496  	int ret = 0;
1497  
1498  	tl_to_darg(&darg, tl, val);
1499  	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1500  			darg.parent_ino, darg.dname_len);
1501  
1502  	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1503  	if (IS_ERR(inode)) {
1504  		ext4_debug("Inode not found.");
1505  		return 0;
1506  	}
1507  
1508  	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1509  	iput(inode);
1510  	return ret;
1511  }
1512  
1513  /*
1514   * Record all the modified inodes during replay. We use this later to setup
1515   * block bitmaps correctly.
1516   */
ext4_fc_record_modified_inode(struct super_block * sb,int ino)1517  static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1518  {
1519  	struct ext4_fc_replay_state *state;
1520  	int i;
1521  
1522  	state = &EXT4_SB(sb)->s_fc_replay_state;
1523  	for (i = 0; i < state->fc_modified_inodes_used; i++)
1524  		if (state->fc_modified_inodes[i] == ino)
1525  			return 0;
1526  	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1527  		int *fc_modified_inodes;
1528  
1529  		fc_modified_inodes = krealloc(state->fc_modified_inodes,
1530  				sizeof(int) * (state->fc_modified_inodes_size +
1531  				EXT4_FC_REPLAY_REALLOC_INCREMENT),
1532  				GFP_KERNEL);
1533  		if (!fc_modified_inodes)
1534  			return -ENOMEM;
1535  		state->fc_modified_inodes = fc_modified_inodes;
1536  		state->fc_modified_inodes_size +=
1537  			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1538  	}
1539  	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1540  	return 0;
1541  }
1542  
1543  /*
1544   * Inode replay function
1545   */
ext4_fc_replay_inode(struct super_block * sb,struct ext4_fc_tl_mem * tl,u8 * val)1546  static int ext4_fc_replay_inode(struct super_block *sb,
1547  				struct ext4_fc_tl_mem *tl, u8 *val)
1548  {
1549  	struct ext4_fc_inode fc_inode;
1550  	struct ext4_inode *raw_inode;
1551  	struct ext4_inode *raw_fc_inode;
1552  	struct inode *inode = NULL;
1553  	struct ext4_iloc iloc;
1554  	int inode_len, ino, ret, tag = tl->fc_tag;
1555  	struct ext4_extent_header *eh;
1556  	size_t off_gen = offsetof(struct ext4_inode, i_generation);
1557  
1558  	memcpy(&fc_inode, val, sizeof(fc_inode));
1559  
1560  	ino = le32_to_cpu(fc_inode.fc_ino);
1561  	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1562  
1563  	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1564  	if (!IS_ERR(inode)) {
1565  		ext4_ext_clear_bb(inode);
1566  		iput(inode);
1567  	}
1568  	inode = NULL;
1569  
1570  	ret = ext4_fc_record_modified_inode(sb, ino);
1571  	if (ret)
1572  		goto out;
1573  
1574  	raw_fc_inode = (struct ext4_inode *)
1575  		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1576  	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1577  	if (ret)
1578  		goto out;
1579  
1580  	inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
1581  	raw_inode = ext4_raw_inode(&iloc);
1582  
1583  	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1584  	memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
1585  	       inode_len - off_gen);
1586  	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1587  		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1588  		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1589  			memset(eh, 0, sizeof(*eh));
1590  			eh->eh_magic = EXT4_EXT_MAGIC;
1591  			eh->eh_max = cpu_to_le16(
1592  				(sizeof(raw_inode->i_block) -
1593  				 sizeof(struct ext4_extent_header))
1594  				 / sizeof(struct ext4_extent));
1595  		}
1596  	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1597  		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1598  			sizeof(raw_inode->i_block));
1599  	}
1600  
1601  	/* Immediately update the inode on disk. */
1602  	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1603  	if (ret)
1604  		goto out;
1605  	ret = sync_dirty_buffer(iloc.bh);
1606  	if (ret)
1607  		goto out;
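	/*
	 * Make sure the inode is marked as in-use in the inode bitmap and
	 * that the group metadata is updated to match.
	 */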
1608  	ret = ext4_mark_inode_used(sb, ino);
1609  	if (ret)
1610  		goto out;
1611  
1612  	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1613  	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1614  	if (IS_ERR(inode)) {
1615  		ext4_debug("Inode not found.");
1616  		return -EFSCORRUPTED;
1617  	}
1618  
1619  	/*
1620  	 * Our allocator could have made different decisions than before
1621  	 * crashing. This should be fixed, but until then, we recalculate
1622  	 * the number of blocks used by the inode.
1623  	 */
1624  	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1625  		ext4_ext_replay_set_iblocks(inode);
1626  
1627  	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1628  	ext4_reset_inode_seed(inode);
1629  
1630  	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1631  	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1632  	sync_dirty_buffer(iloc.bh);
1633  	brelse(iloc.bh);
1634  out:
1635  	iput(inode);
1636  	if (!ret)
1637  		blkdev_issue_flush(sb->s_bdev);
1638  
1639  	return 0;
1640  }
1641  
1642  /*
1643   * Dentry create replay function.
1644   *
1645   * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
1646   * inode for which we are trying to create a dentry here should already have
1647   * been replayed before we get here.
1648   */
1649  static int ext4_fc_replay_create(struct super_block *sb,
1650  				 struct ext4_fc_tl_mem *tl, u8 *val)
1651  {
1652  	int ret = 0;
1653  	struct inode *inode = NULL;
1654  	struct inode *dir = NULL;
1655  	struct dentry_info_args darg;
1656  
1657  	tl_to_darg(&darg, tl, val);
1658  
1659  	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1660  			darg.parent_ino, darg.dname_len);
1661  
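	/*
	 * Replay order: mark the inode in-use, then (for directories) set up
	 * the '.' and '..' entries, then link the new dentry and fix up the
	 * inode's link count.
	 */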
1662  	/* This takes care of updating the group descriptor and other metadata */
1663  	ret = ext4_mark_inode_used(sb, darg.ino);
1664  	if (ret)
1665  		goto out;
1666  
1667  	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1668  	if (IS_ERR(inode)) {
1669  		ext4_debug("inode %d not found.", darg.ino);
1670  		inode = NULL;
1671  		ret = -EINVAL;
1672  		goto out;
1673  	}
1674  
1675  	if (S_ISDIR(inode->i_mode)) {
1676  		/*
1677  		 * If we are creating a directory, we need to make sure that the
1678  		 * dot and dot dot dirents are set up properly.
1679  		 */
1680  		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1681  		if (IS_ERR(dir)) {
1682  			ext4_debug("Dir %d not found.", darg.parent_ino);
1683  			goto out;
1684  		}
1685  		ret = ext4_init_new_dir(NULL, dir, inode);
1686  		iput(dir);
1687  		if (ret) {
1688  			ret = 0;
1689  			goto out;
1690  		}
1691  	}
1692  	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1693  	if (ret)
1694  		goto out;
1695  	set_nlink(inode, 1);
1696  	ext4_mark_inode_dirty(NULL, inode);
1697  out:
1698  	iput(inode);
1699  	return ret;
1700  }
1701  
1702  /*
1703   * Record physical disk regions which are in use, as per the fast commit
1704   * area, and which are used by inodes during the replay phase. Our simple
1705   * replay phase allocator excludes these regions from allocation.
1706   */
1707  int ext4_fc_record_regions(struct super_block *sb, int ino,
1708  		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1709  {
1710  	struct ext4_fc_replay_state *state;
1711  	struct ext4_fc_alloc_region *region;
1712  
1713  	state = &EXT4_SB(sb)->s_fc_replay_state;
1714  	/*
1715  	 * During the replay phase, fc_regions_valid may not be the same as
1716  	 * fc_regions_used; bring it up to date before recording new additions.
1717  	 */
1718  	if (replay && state->fc_regions_used != state->fc_regions_valid)
1719  		state->fc_regions_used = state->fc_regions_valid;
1720  	if (state->fc_regions_used == state->fc_regions_size) {
1721  		struct ext4_fc_alloc_region *fc_regions;
1722  
1723  		fc_regions = krealloc(state->fc_regions,
1724  				      sizeof(struct ext4_fc_alloc_region) *
1725  				      (state->fc_regions_size +
1726  				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
1727  				      GFP_KERNEL);
1728  		if (!fc_regions)
1729  			return -ENOMEM;
1730  		state->fc_regions_size +=
1731  			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1732  		state->fc_regions = fc_regions;
1733  	}
1734  	region = &state->fc_regions[state->fc_regions_used++];
1735  	region->ino = ino;
1736  	region->lblk = lblk;
1737  	region->pblk = pblk;
1738  	region->len = len;
1739  
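	/*
	 * During the scan phase (replay == 0), regions only become valid in
	 * bulk once a tail tag with a matching TID and CRC is seen; during
	 * the replay phase we can trust the region right away.
	 */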
1740  	if (replay)
1741  		state->fc_regions_valid++;
1742  
1743  	return 0;
1744  }
1745  
1746  /* Replay add range tag */
1747  static int ext4_fc_replay_add_range(struct super_block *sb,
1748  				    struct ext4_fc_tl_mem *tl, u8 *val)
1749  {
1750  	struct ext4_fc_add_range fc_add_ex;
1751  	struct ext4_extent newex, *ex;
1752  	struct inode *inode;
1753  	ext4_lblk_t start, cur;
1754  	int remaining, len;
1755  	ext4_fsblk_t start_pblk;
1756  	struct ext4_map_blocks map;
1757  	struct ext4_ext_path *path = NULL;
1758  	int ret;
1759  
1760  	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1761  	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1762  
1763  	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1764  		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1765  		ext4_ext_get_actual_len(ex));
1766  
1767  	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1768  	if (IS_ERR(inode)) {
1769  		ext4_debug("Inode not found.");
1770  		return 0;
1771  	}
1772  
1773  	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1774  	if (ret)
1775  		goto out;
1776  
1777  	start = le32_to_cpu(ex->ee_block);
1778  	start_pblk = ext4_ext_pblock(ex);
1779  	len = ext4_ext_get_actual_len(ex);
1780  
1781  	cur = start;
1782  	remaining = len;
1783  	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1784  		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1785  		  inode->i_ino);
1786  
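	/*
	 * Walk the range chunk by chunk. Each chunk falls in one of three
	 * cases: not mapped at all (insert a fresh extent), mapped to the
	 * wrong physical blocks (rewrite the mapping and free the stale
	 * blocks), or mapped correctly but with a stale unwritten state
	 * (toggle the state in place).
	 */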
1787  	while (remaining > 0) {
1788  		map.m_lblk = cur;
1789  		map.m_len = remaining;
1790  		map.m_pblk = 0;
1791  		ret = ext4_map_blocks(NULL, inode, &map, 0);
1792  
1793  		if (ret < 0)
1794  			goto out;
1795  
1796  		if (ret == 0) {
1797  			/* Range is not mapped */
1798  			path = ext4_find_extent(inode, cur, path, 0);
1799  			if (IS_ERR(path))
1800  				goto out;
1801  			memset(&newex, 0, sizeof(newex));
1802  			newex.ee_block = cpu_to_le32(cur);
1803  			ext4_ext_store_pblock(
1804  				&newex, start_pblk + cur - start);
1805  			newex.ee_len = cpu_to_le16(map.m_len);
1806  			if (ext4_ext_is_unwritten(ex))
1807  				ext4_ext_mark_unwritten(&newex);
1808  			down_write(&EXT4_I(inode)->i_data_sem);
1809  			path = ext4_ext_insert_extent(NULL, inode,
1810  						      path, &newex, 0);
1811  			up_write((&EXT4_I(inode)->i_data_sem));
1812  			if (IS_ERR(path))
1813  				goto out;
1814  			goto next;
1815  		}
1816  
1817  		if (start_pblk + cur - start != map.m_pblk) {
1818  			/*
1819  			 * Logical to physical mapping changed. This can happen
1820  			 * if this range was removed and then reallocated to
1821  			 * map to new physical blocks during a fast commit.
1822  			 */
1823  			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1824  					ext4_ext_is_unwritten(ex),
1825  					start_pblk + cur - start);
1826  			if (ret)
1827  				goto out;
1828  			/*
1829  			 * Mark the old blocks as free since they aren't used
1830  			 * anymore. We maintain an array of all the modified
1831  			 * inodes. In case these blocks are still used at either
1832  			 * a different logical range in the same inode or in
1833  			 * some different inode, we will mark them as allocated
1834  			 * at the end of the FC replay using our array of
1835  			 * modified inodes.
1836  			 */
1837  			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
1838  			goto next;
1839  		}
1840  
1841  		/* Range is mapped and needs a state change */
1842  		ext4_debug("Converting from %ld to %d %lld",
1843  			   map.m_flags & EXT4_MAP_UNWRITTEN,
1844  			   ext4_ext_is_unwritten(ex), map.m_pblk);
1845  		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1846  					ext4_ext_is_unwritten(ex), map.m_pblk);
1847  		if (ret)
1848  			goto out;
1849  		/*
1850  		 * We may have split the extent tree while toggling the state.
1851  		 * Try to shrink the extent tree now.
1852  		 */
1853  		ext4_ext_replay_shrink_inode(inode, start + len);
1854  next:
1855  		cur += map.m_len;
1856  		remaining -= map.m_len;
1857  	}
1858  	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1859  					sb->s_blocksize_bits);
1860  out:
1861  	ext4_free_ext_path(path);
1862  	iput(inode);
1863  	return 0;
1864  }
1865  
1866  /* Replay DEL_RANGE tag */
1867  static int
1868  ext4_fc_replay_del_range(struct super_block *sb,
1869  			 struct ext4_fc_tl_mem *tl, u8 *val)
1870  {
1871  	struct inode *inode;
1872  	struct ext4_fc_del_range lrange;
1873  	struct ext4_map_blocks map;
1874  	ext4_lblk_t cur, remaining;
1875  	int ret;
1876  
1877  	memcpy(&lrange, val, sizeof(lrange));
1878  	cur = le32_to_cpu(lrange.fc_lblk);
1879  	remaining = le32_to_cpu(lrange.fc_len);
1880  
1881  	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1882  		le32_to_cpu(lrange.fc_ino), cur, remaining);
1883  
1884  	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1885  	if (IS_ERR(inode)) {
1886  		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
1887  		return 0;
1888  	}
1889  
1890  	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1891  	if (ret)
1892  		goto out;
1893  
1894  	ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
1895  			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1896  			le32_to_cpu(lrange.fc_len));
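	/*
	 * First release the physical blocks in the block bitmap; the logical
	 * range itself is removed from the extent tree further below.
	 */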
1897  	while (remaining > 0) {
1898  		map.m_lblk = cur;
1899  		map.m_len = remaining;
1900  
1901  		ret = ext4_map_blocks(NULL, inode, &map, 0);
1902  		if (ret < 0)
1903  			goto out;
1904  		if (ret > 0) {
1905  			remaining -= ret;
1906  			cur += ret;
1907  			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
1908  		} else {
1909  			remaining -= map.m_len;
1910  			cur += map.m_len;
1911  		}
1912  	}
1913  
1914  	down_write(&EXT4_I(inode)->i_data_sem);
1915  	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1916  				le32_to_cpu(lrange.fc_lblk) +
1917  				le32_to_cpu(lrange.fc_len) - 1);
1918  	up_write(&EXT4_I(inode)->i_data_sem);
1919  	if (ret)
1920  		goto out;
1921  	ext4_ext_replay_shrink_inode(inode,
1922  		i_size_read(inode) >> sb->s_blocksize_bits);
1923  	ext4_mark_inode_dirty(NULL, inode);
1924  out:
1925  	iput(inode);
1926  	return 0;
1927  }
1928  
1929  static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1930  {
1931  	struct ext4_fc_replay_state *state;
1932  	struct inode *inode;
1933  	struct ext4_ext_path *path = NULL;
1934  	struct ext4_map_blocks map;
1935  	int i, ret, j;
1936  	ext4_lblk_t cur, end;
1937  
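	/*
	 * Walk the extent tree of each inode modified during replay and mark
	 * both its data blocks and its extent tree index blocks as in-use in
	 * the block bitmaps.
	 */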
1938  	state = &EXT4_SB(sb)->s_fc_replay_state;
1939  	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1940  		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1941  			EXT4_IGET_NORMAL);
1942  		if (IS_ERR(inode)) {
1943  			ext4_debug("Inode %d not found.",
1944  				state->fc_modified_inodes[i]);
1945  			continue;
1946  		}
1947  		cur = 0;
1948  		end = EXT_MAX_BLOCKS;
1949  		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1950  			iput(inode);
1951  			continue;
1952  		}
1953  		while (cur < end) {
1954  			map.m_lblk = cur;
1955  			map.m_len = end - cur;
1956  
1957  			ret = ext4_map_blocks(NULL, inode, &map, 0);
1958  			if (ret < 0)
1959  				break;
1960  
1961  			if (ret > 0) {
1962  				path = ext4_find_extent(inode, map.m_lblk, path, 0);
1963  				if (!IS_ERR(path)) {
1964  					for (j = 0; j < path->p_depth; j++)
1965  						ext4_mb_mark_bb(inode->i_sb,
1966  							path[j].p_block, 1, true);
1967  				} else {
1968  					path = NULL;
1969  				}
1970  				cur += ret;
1971  				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1972  							map.m_len, true);
1973  			} else {
1974  				cur = cur + (map.m_len ? map.m_len : 1);
1975  			}
1976  		}
1977  		iput(inode);
1978  	}
1979  
1980  	ext4_free_ext_path(path);
1981  }
1982  
1983  /*
1984   * Check if a block is in the excluded regions for block allocation. The
1985   * simple allocator that runs during the replay phase calls this function
1986   * to see if it is okay to use a block.
1987   */
1988  bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1989  {
1990  	int i;
1991  	struct ext4_fc_replay_state *state;
1992  
1993  	state = &EXT4_SB(sb)->s_fc_replay_state;
1994  	for (i = 0; i < state->fc_regions_valid; i++) {
1995  		if (state->fc_regions[i].ino == 0 ||
1996  			state->fc_regions[i].len == 0)
1997  			continue;
1998  		if (in_range(blk, state->fc_regions[i].pblk,
1999  					state->fc_regions[i].len))
2000  			return true;
2001  	}
2002  	return false;
2003  }
2004  
2005  /* Cleanup function called after replay */
2006  void ext4_fc_replay_cleanup(struct super_block *sb)
2007  {
2008  	struct ext4_sb_info *sbi = EXT4_SB(sb);
2009  
2010  	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
2011  	kfree(sbi->s_fc_replay_state.fc_regions);
2012  	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
2013  }
2014  
2015  static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
2016  				      int tag, int len)
2017  {
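	/*
	 * Sanity check the value length for each tag: fixed-size payloads for
	 * the range and head tags, a 1..EXT4_NAME_LEN name for the dentry
	 * tags, a raw inode of a sane size for the inode tag, and variable
	 * lengths for pad and tail.
	 */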
2018  	switch (tag) {
2019  	case EXT4_FC_TAG_ADD_RANGE:
2020  		return len == sizeof(struct ext4_fc_add_range);
2021  	case EXT4_FC_TAG_DEL_RANGE:
2022  		return len == sizeof(struct ext4_fc_del_range);
2023  	case EXT4_FC_TAG_CREAT:
2024  	case EXT4_FC_TAG_LINK:
2025  	case EXT4_FC_TAG_UNLINK:
2026  		len -= sizeof(struct ext4_fc_dentry_info);
2027  		return len >= 1 && len <= EXT4_NAME_LEN;
2028  	case EXT4_FC_TAG_INODE:
2029  		len -= sizeof(struct ext4_fc_inode);
2030  		return len >= EXT4_GOOD_OLD_INODE_SIZE &&
2031  			len <= sbi->s_inode_size;
2032  	case EXT4_FC_TAG_PAD:
2033  		return true; /* padding can have any length */
2034  	case EXT4_FC_TAG_TAIL:
2035  		return len >= sizeof(struct ext4_fc_tail);
2036  	case EXT4_FC_TAG_HEAD:
2037  		return len == sizeof(struct ext4_fc_head);
2038  	}
2039  	return false;
2040  }
2041  
2042  /*
2043   * Recovery Scan phase handler
2044   *
2045   * This function is called during the scan phase and is responsible
2046   * for doing the following things:
2047   * - Make sure the fast commit area has valid tags for replay
2048   * - Count number of tags that need to be replayed by the replay handler
2049   * - Verify CRC
2050   * - Create a list of excluded blocks for allocation during replay phase
2051   *
2052   * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
2053   * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
2054   * to indicate that scan has finished and JBD2 can now start replay phase.
2055   * It returns a negative error code to indicate that an error occurred. At the
2056   * end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is
2057   * set to the number of tags that need to be replayed during the replay phase.
2058   */
2059  static int ext4_fc_replay_scan(journal_t *journal,
2060  				struct buffer_head *bh, int off,
2061  				tid_t expected_tid)
2062  {
2063  	struct super_block *sb = journal->j_private;
2064  	struct ext4_sb_info *sbi = EXT4_SB(sb);
2065  	struct ext4_fc_replay_state *state;
2066  	int ret = JBD2_FC_REPLAY_CONTINUE;
2067  	struct ext4_fc_add_range ext;
2068  	struct ext4_fc_tl_mem tl;
2069  	struct ext4_fc_tail tail;
2070  	__u8 *start, *end, *cur, *val;
2071  	struct ext4_fc_head head;
2072  	struct ext4_extent *ex;
2073  
2074  	state = &sbi->s_fc_replay_state;
2075  
2076  	start = (u8 *)bh->b_data;
2077  	end = start + journal->j_blocksize;
2078  
2079  	if (state->fc_replay_expected_off == 0) {
2080  		state->fc_cur_tag = 0;
2081  		state->fc_replay_num_tags = 0;
2082  		state->fc_crc = 0;
2083  		state->fc_regions = NULL;
2084  		state->fc_regions_valid = state->fc_regions_used =
2085  			state->fc_regions_size = 0;
2086  		/* Check if we can stop early */
2087  		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2088  			!= EXT4_FC_TAG_HEAD)
2089  			return 0;
2090  	}
2091  
2092  	if (off != state->fc_replay_expected_off) {
2093  		ret = -EFSCORRUPTED;
2094  		goto out_err;
2095  	}
2096  
2097  	state->fc_replay_expected_off++;
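	/*
	 * Walk the block as a sequence of TLVs: read a tag/length header,
	 * validate the value length, then advance by header plus value
	 * length.
	 */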
2098  	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2099  	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2100  		ext4_fc_get_tl(&tl, cur);
2101  		val = cur + EXT4_FC_TAG_BASE_LEN;
2102  		if (tl.fc_len > end - val ||
2103  		    !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
2104  			ret = state->fc_replay_num_tags ?
2105  				JBD2_FC_REPLAY_STOP : -ECANCELED;
2106  			goto out_err;
2107  		}
2108  		ext4_debug("Scan phase, tag:%s, blk %lld\n",
2109  			   tag2str(tl.fc_tag), bh->b_blocknr);
2110  		switch (tl.fc_tag) {
2111  		case EXT4_FC_TAG_ADD_RANGE:
2112  			memcpy(&ext, val, sizeof(ext));
2113  			ex = (struct ext4_extent *)&ext.fc_ex;
2114  			ret = ext4_fc_record_regions(sb,
2115  				le32_to_cpu(ext.fc_ino),
2116  				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2117  				ext4_ext_get_actual_len(ex), 0);
2118  			if (ret < 0)
2119  				break;
2120  			ret = JBD2_FC_REPLAY_CONTINUE;
2121  			fallthrough;
2122  		case EXT4_FC_TAG_DEL_RANGE:
2123  		case EXT4_FC_TAG_LINK:
2124  		case EXT4_FC_TAG_UNLINK:
2125  		case EXT4_FC_TAG_CREAT:
2126  		case EXT4_FC_TAG_INODE:
2127  		case EXT4_FC_TAG_PAD:
2128  			state->fc_cur_tag++;
2129  			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2130  				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2131  			break;
2132  		case EXT4_FC_TAG_TAIL:
2133  			state->fc_cur_tag++;
2134  			memcpy(&tail, val, sizeof(tail));
2135  			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2136  						EXT4_FC_TAG_BASE_LEN +
2137  						offsetof(struct ext4_fc_tail,
2138  						fc_crc));
2139  			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2140  				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2141  				state->fc_replay_num_tags = state->fc_cur_tag;
2142  				state->fc_regions_valid =
2143  					state->fc_regions_used;
2144  			} else {
2145  				ret = state->fc_replay_num_tags ?
2146  					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2147  			}
2148  			state->fc_crc = 0;
2149  			break;
2150  		case EXT4_FC_TAG_HEAD:
2151  			memcpy(&head, val, sizeof(head));
2152  			if (le32_to_cpu(head.fc_features) &
2153  				~EXT4_FC_SUPPORTED_FEATURES) {
2154  				ret = -EOPNOTSUPP;
2155  				break;
2156  			}
2157  			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2158  				ret = JBD2_FC_REPLAY_STOP;
2159  				break;
2160  			}
2161  			state->fc_cur_tag++;
2162  			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2163  				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2164  			break;
2165  		default:
2166  			ret = state->fc_replay_num_tags ?
2167  				JBD2_FC_REPLAY_STOP : -ECANCELED;
2168  		}
2169  		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2170  			break;
2171  	}
2172  
2173  out_err:
2174  	trace_ext4_fc_replay_scan(sb, ret, off);
2175  	return ret;
2176  }
2177  
2178  /*
2179   * Main recovery path entry point.
2180   * The meaning of the return codes is the same as described above.
2181   */
2182  static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2183  				enum passtype pass, int off, tid_t expected_tid)
2184  {
2185  	struct super_block *sb = journal->j_private;
2186  	struct ext4_sb_info *sbi = EXT4_SB(sb);
2187  	struct ext4_fc_tl_mem tl;
2188  	__u8 *start, *end, *cur, *val;
2189  	int ret = JBD2_FC_REPLAY_CONTINUE;
2190  	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2191  	struct ext4_fc_tail tail;
2192  
2193  	if (pass == PASS_SCAN) {
2194  		state->fc_current_pass = PASS_SCAN;
2195  		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2196  	}
2197  
2198  	if (state->fc_current_pass != pass) {
2199  		state->fc_current_pass = pass;
2200  		sbi->s_mount_state |= EXT4_FC_REPLAY;
2201  	}
2202  	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2203  		ext4_debug("Replay stops\n");
2204  		ext4_fc_set_bitmaps_and_counters(sb);
2205  		return 0;
2206  	}
2207  
2208  #ifdef CONFIG_EXT4_DEBUG
2209  	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2210  		pr_warn("Dropping fc block %d because max_replay set\n", off);
2211  		return JBD2_FC_REPLAY_STOP;
2212  	}
2213  #endif
2214  
2215  	start = (u8 *)bh->b_data;
2216  	end = start + journal->j_blocksize;
2217  
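	/*
	 * Dispatch each TLV in this block to its replay handler. Only as many
	 * tags as were counted during the scan phase are replayed; once that
	 * budget is exhausted we stop and set up the bitmaps and counters.
	 */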
2218  	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2219  	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2220  		ext4_fc_get_tl(&tl, cur);
2221  		val = cur + EXT4_FC_TAG_BASE_LEN;
2222  
2223  		if (state->fc_replay_num_tags == 0) {
2224  			ret = JBD2_FC_REPLAY_STOP;
2225  			ext4_fc_set_bitmaps_and_counters(sb);
2226  			break;
2227  		}
2228  
2229  		ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
2230  		state->fc_replay_num_tags--;
2231  		switch (tl.fc_tag) {
2232  		case EXT4_FC_TAG_LINK:
2233  			ret = ext4_fc_replay_link(sb, &tl, val);
2234  			break;
2235  		case EXT4_FC_TAG_UNLINK:
2236  			ret = ext4_fc_replay_unlink(sb, &tl, val);
2237  			break;
2238  		case EXT4_FC_TAG_ADD_RANGE:
2239  			ret = ext4_fc_replay_add_range(sb, &tl, val);
2240  			break;
2241  		case EXT4_FC_TAG_CREAT:
2242  			ret = ext4_fc_replay_create(sb, &tl, val);
2243  			break;
2244  		case EXT4_FC_TAG_DEL_RANGE:
2245  			ret = ext4_fc_replay_del_range(sb, &tl, val);
2246  			break;
2247  		case EXT4_FC_TAG_INODE:
2248  			ret = ext4_fc_replay_inode(sb, &tl, val);
2249  			break;
2250  		case EXT4_FC_TAG_PAD:
2251  			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2252  					     tl.fc_len, 0);
2253  			break;
2254  		case EXT4_FC_TAG_TAIL:
2255  			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
2256  					     0, tl.fc_len, 0);
2257  			memcpy(&tail, val, sizeof(tail));
2258  			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2259  			break;
2260  		case EXT4_FC_TAG_HEAD:
2261  			break;
2262  		default:
2263  			trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
2264  			ret = -ECANCELED;
2265  			break;
2266  		}
2267  		if (ret < 0)
2268  			break;
2269  		ret = JBD2_FC_REPLAY_CONTINUE;
2270  	}
2271  	return ret;
2272  }
2273  
2274  void ext4_fc_init(struct super_block *sb, journal_t *journal)
2275  {
2276  	/*
2277  	 * We set the replay callback even if fast commit is disabled because
2278  	 * we could still have fast commit blocks that need to be replayed
2279  	 * even if fast commit has now been turned off.
2280  	 */
2281  	journal->j_fc_replay_callback = ext4_fc_replay;
2282  	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2283  		return;
2284  	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2285  }
2286  
2287  static const char * const fc_ineligible_reasons[] = {
2288  	[EXT4_FC_REASON_XATTR] = "Extended attributes changed",
2289  	[EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
2290  	[EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
2291  	[EXT4_FC_REASON_NOMEM] = "Insufficient memory",
2292  	[EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
2293  	[EXT4_FC_REASON_RESIZE] = "Resize",
2294  	[EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
2295  	[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
2296  	[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
2297  	[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
2298  };
2299  
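/*
 * Print the fast commit stats through seq_file. Illustrative output (the
 * values below are made up):
 *
 *	fc stats:
 *	128 commits
 *	3 ineligible
 *	512 numblks
 *	250us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	1
 *	...
 */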
2300  int ext4_fc_info_show(struct seq_file *seq, void *v)
2301  {
2302  	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2303  	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2304  	int i;
2305  
2306  	if (v != SEQ_START_TOKEN)
2307  		return 0;
2308  
2309  	seq_printf(seq,
2310  		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2311  		   stats->fc_num_commits, stats->fc_ineligible_commits,
2312  		   stats->fc_numblks,
2313  		   div_u64(stats->s_fc_avg_commit_time, 1000));
2314  	seq_puts(seq, "Ineligible reasons:\n");
2315  	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2316  		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2317  			stats->fc_ineligible_reason_count[i]);
2318  
2319  	return 0;
2320  }
2321  
2322  int __init ext4_fc_init_dentry_cache(void)
2323  {
2324  	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2325  					   SLAB_RECLAIM_ACCOUNT);
2326  
2327  	if (ext4_fc_dentry_cachep == NULL)
2328  		return -ENOMEM;
2329  
2330  	return 0;
2331  }
2332  
2333  void ext4_fc_destroy_dentry_cache(void)
2334  {
2335  	kmem_cache_destroy(ext4_fc_dentry_cachep);
2336  }
2337