// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/ialloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  BSD ufs-inspired inode and directory allocation by
 *  Stephen Tweedie (sct@redhat.com), 1993
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/cred.h>

#include <asm/byteorder.h>

#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

#include <trace/events/ext4.h>

/*
 * ialloc.c contains the inode allocation and deallocation routines
 */

/*
 * The free inodes are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the block numbers of the group's
 * bitmaps and inode table, and the group's free block and inode counts.
 */

/*
 * To avoid calling the atomic setbit hundreds or thousands of times, we only
 * need to use it within a single byte (to ensure we get endianness right).
 * We can use memset for the rest of the bitmap as there are no other users.
 */
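/*
 * Example (illustrative numbers only): with start_bit = 10 and
 * end_bit = 64, the loop below sets bits 10..15 individually (up to the
 * next byte boundary, (10 + 7) & ~7 = 16), and the memset then fills
 * bytes 2..7 of the bitmap ((64 - 16) >> 3 = 6 bytes) in one go.
 */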
void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
{
	int i;

	if (start_bit >= end_bit)
		return;

	ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
	for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
		ext4_set_bit(i, bitmap);
	if (i < end_bit)
		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
}

void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
		set_bitmap_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}

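/*
 * Verify the checksum of an inode bitmap that has just been read or
 * initialized, caching the result in the buffer's verified bit.  The
 * buffer_verified() test is done twice - once without and once with the
 * group lock held - so concurrent readers do not verify (or flag
 * corruption on) the same bitmap more than once.  Returns 0 on success,
 * -EFSCORRUPTED or -EFSBADCRC on failure.
 */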
static int ext4_validate_inode_bitmap(struct super_block *sb,
				      struct ext4_group_desc *desc,
				      ext4_group_t block_group,
				      struct buffer_head *bh)
{
	ext4_fsblk_t	blk;
	struct ext4_group_info *grp;

	if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
		return 0;

	if (buffer_verified(bh))
		return 0;

	grp = ext4_get_group_info(sb, block_group);
	if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
		return -EFSCORRUPTED;

	ext4_lock_group(sb, block_group);
	if (buffer_verified(bh))
		goto verified;
	blk = ext4_inode_bitmap(sb, desc);
	if (!ext4_inode_bitmap_csum_verify(sb, desc, bh) ||
	    ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
		ext4_unlock_group(sb, block_group);
		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
			   "inode_bitmap = %llu", block_group, blk);
		ext4_mark_group_bitmap_corrupted(sb, block_group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
		return -EFSBADCRC;
	}
	set_buffer_verified(bh);
verified:
	ext4_unlock_group(sb, block_group);
	return 0;
}

/*
 * Read the inode allocation bitmap for a given block_group, reading
 * into the specified slot in the superblock's bitmap cache.
 *
 * Return buffer_head of bitmap on success, or an ERR_PTR on error.
 */
static struct buffer_head *
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
{
	struct ext4_group_desc *desc;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh = NULL;
	ext4_fsblk_t bitmap_blk;
	int err;

	desc = ext4_get_group_desc(sb, block_group, NULL);
	if (!desc)
		return ERR_PTR(-EFSCORRUPTED);

	bitmap_blk = ext4_inode_bitmap(sb, desc);
	if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
	    (bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
		ext4_error(sb, "Invalid inode bitmap blk %llu in "
			   "block_group %u", bitmap_blk, block_group);
		ext4_mark_group_bitmap_corrupted(sb, block_group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
		return ERR_PTR(-EFSCORRUPTED);
	}
	bh = sb_getblk(sb, bitmap_blk);
	if (unlikely(!bh)) {
		ext4_warning(sb, "Cannot read inode bitmap - "
			     "block_group = %u, inode_bitmap = %llu",
			     block_group, bitmap_blk);
		return ERR_PTR(-ENOMEM);
	}
	if (bitmap_uptodate(bh))
		goto verify;

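	/*
	 * Serialize initialization and reading of the bitmap against
	 * other tasks; the bitmap_uptodate() test is repeated under the
	 * buffer lock because someone may have filled the buffer while
	 * we slept on the lock.
	 */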
	lock_buffer(bh);
	if (bitmap_uptodate(bh)) {
		unlock_buffer(bh);
		goto verify;
	}

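	/*
	 * For a group whose inode bitmap was never written out
	 * (INODE_UNINIT), construct an all-free bitmap in memory instead
	 * of reading stale data from disk: clear the in-use bits and mark
	 * the padding bits past EXT4_INODES_PER_GROUP(sb) as used.
	 */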
	ext4_lock_group(sb, block_group);
	if (ext4_has_group_desc_csum(sb) &&
	    (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
		if (block_group == 0) {
			ext4_unlock_group(sb, block_group);
			unlock_buffer(bh);
			ext4_error(sb, "Inode bitmap for bg 0 marked "
				   "uninitialized");
			err = -EFSCORRUPTED;
			goto out;
		}
		memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
		ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
				     sb->s_blocksize * 8, bh->b_data);
		set_bitmap_uptodate(bh);
		set_buffer_uptodate(bh);
		set_buffer_verified(bh);
		ext4_unlock_group(sb, block_group);
		unlock_buffer(bh);
		return bh;
	}
	ext4_unlock_group(sb, block_group);

	if (buffer_uptodate(bh)) {
		/*
		 * If bh is uptodate and the group was not uninit,
		 * the bitmap is also uptodate.
		 */
		set_bitmap_uptodate(bh);
		unlock_buffer(bh);
		goto verify;
	}
	/*
	 * submit the buffer_head for reading
	 */
	trace_ext4_load_inode_bitmap(sb, block_group);
	ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read);
	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
	if (!buffer_uptodate(bh)) {
		put_bh(bh);
		ext4_error_err(sb, EIO, "Cannot read inode bitmap - "
			       "block_group = %u, inode_bitmap = %llu",
			       block_group, bitmap_blk);
		ext4_mark_group_bitmap_corrupted(sb, block_group,
				EXT4_GROUP_INFO_IBITMAP_CORRUPT);
		return ERR_PTR(-EIO);
	}

verify:
	err = ext4_validate_inode_bitmap(sb, desc, block_group, bh);
	if (err)
		goto out;
	return bh;
out:
	put_bh(bh);
	return ERR_PTR(err);
}

/*
 * NOTE! When we get the inode, we're the only people
 * that have access to it, and as such there are no
 * race conditions we have to worry about. The inode
 * is not on the hash-lists, and it cannot be reached
 * through the filesystem because the directory entry
 * has been deleted earlier.
 *
 * HOWEVER: we must make sure that we get no aliases,
 * which means that we have to call "clear_inode()"
 * _before_ we mark the inode not in use in the inode
 * bitmaps. Otherwise a newly created file might use
 * the same inode number (not actually the same pointer
 * though), and then we'd have two inodes sharing the
 * same inode number and space on the harddisk.
 */
void ext4_free_inode(handle_t *handle, struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	int is_directory;
	unsigned long ino;
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *bh2;
	ext4_group_t block_group;
	unsigned long bit;
	struct ext4_group_desc *gdp;
	struct ext4_super_block *es;
	struct ext4_sb_info *sbi;
	int fatal = 0, err, count, cleared;
	struct ext4_group_info *grp;

	if (!sb) {
		printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
		       "nonexistent device\n", __func__, __LINE__);
		return;
	}
	if (atomic_read(&inode->i_count) > 1) {
		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
			 __func__, __LINE__, inode->i_ino,
			 atomic_read(&inode->i_count));
		return;
	}
	if (inode->i_nlink) {
		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
			 __func__, __LINE__, inode->i_ino, inode->i_nlink);
		return;
	}
	sbi = EXT4_SB(sb);

	ino = inode->i_ino;
	ext4_debug("freeing inode %lu\n", ino);
	trace_ext4_free_inode(inode);

	dquot_initialize(inode);
	dquot_free_inode(inode);

	is_directory = S_ISDIR(inode->i_mode);

	/* Do this BEFORE marking the inode not in use or returning an error */
	ext4_clear_inode(inode);

	es = sbi->s_es;
	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
		ext4_error(sb, "reserved or nonexistent inode %lu", ino);
		goto error_return;
	}
	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
	/* Don't bother if the inode bitmap is corrupt. */
	if (IS_ERR(bitmap_bh)) {
		fatal = PTR_ERR(bitmap_bh);
		bitmap_bh = NULL;
		goto error_return;
	}
	if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
		grp = ext4_get_group_info(sb, block_group);
		if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
			fatal = -EFSCORRUPTED;
			goto error_return;
		}
	}

	BUFFER_TRACE(bitmap_bh, "get_write_access");
	fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh,
					      EXT4_JTR_NONE);
	if (fatal)
		goto error_return;

	fatal = -ESRCH;
	gdp = ext4_get_group_desc(sb, block_group, &bh2);
	if (gdp) {
		BUFFER_TRACE(bh2, "get_write_access");
		fatal = ext4_journal_get_write_access(handle, sb, bh2,
						      EXT4_JTR_NONE);
	}
	ext4_lock_group(sb, block_group);
	cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
	if (fatal || !cleared) {
		ext4_unlock_group(sb, block_group);
		goto out;
	}

	count = ext4_free_inodes_count(sb, gdp) + 1;
	ext4_free_inodes_set(sb, gdp, count);
	if (is_directory) {
		count = ext4_used_dirs_count(sb, gdp) - 1;
		ext4_used_dirs_set(sb, gdp, count);
		if (percpu_counter_initialized(&sbi->s_dirs_counter))
			percpu_counter_dec(&sbi->s_dirs_counter);
	}
	ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh);
	ext4_group_desc_csum_set(sb, block_group, gdp);
	ext4_unlock_group(sb, block_group);

	if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
		percpu_counter_inc(&sbi->s_freeinodes_counter);
	if (sbi->s_log_groups_per_flex) {
		struct flex_groups *fg;

		fg = sbi_array_rcu_deref(sbi, s_flex_groups,
					 ext4_flex_group(sbi, block_group));
		atomic_inc(&fg->free_inodes);
		if (is_directory)
			atomic_dec(&fg->used_dirs);
	}
	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
out:
	if (cleared) {
		BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
		if (!fatal)
			fatal = err;
	} else {
		ext4_error(sb, "bit already cleared for inode %lu", ino);
		ext4_mark_group_bitmap_corrupted(sb, block_group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
	}

error_return:
	brelse(bitmap_bh);
	ext4_std_error(sb, fatal);
}

struct orlov_stats {
	__u64 free_clusters;
	__u32 free_inodes;
	__u32 used_dirs;
};

/*
 * Helper function for Orlov's allocator; returns critical information
 * for a particular block group or flex_bg.  If flex_size is 1, then g
 * is a block group number; otherwise it is flex_bg number.
 */
static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
			    int flex_size, struct orlov_stats *stats)
{
	struct ext4_group_desc *desc;

	if (flex_size > 1) {
		struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb),
							     s_flex_groups, g);
		stats->free_inodes = atomic_read(&fg->free_inodes);
		stats->free_clusters = atomic64_read(&fg->free_clusters);
		stats->used_dirs = atomic_read(&fg->used_dirs);
		return;
	}

	desc = ext4_get_group_desc(sb, g, NULL);
	if (desc) {
		stats->free_inodes = ext4_free_inodes_count(sb, desc);
		stats->free_clusters = ext4_free_group_clusters(sb, desc);
		stats->used_dirs = ext4_used_dirs_count(sb, desc);
	} else {
		stats->free_inodes = 0;
		stats->free_clusters = 0;
		stats->used_dirs = 0;
	}
}

/*
 * Orlov's allocator for directories.
 *
 * We always try to spread first-level directories.
 *
 * If there are block groups with both free inode and free cluster counts
 * no worse than average, we return the one with the smallest directory
 * count.  Otherwise we simply return a random group.
 *
 * For the remaining cases, the rules are:
 *
 * It's OK to put a directory into a group unless
 * it has too many directories already (max_dirs) or
 * it has too few free inodes left (min_inodes) or
 * it has too few free clusters left (min_clusters).
 * The parent's group is preferred; if it doesn't satisfy these
 * conditions we search cyclically through the rest. If none
 * of the groups look good we just look for a group with more
 * free inodes than average (starting at parent's group).
 */
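/*
 * As a rough, illustrative example (the numbers are assumptions, not
 * requirements): with 8192 inodes per group and a flex_bg size of 16,
 * the thresholds computed below work out to max_dirs = ndirs/ngroups +
 * 8192, min_inodes = avefreei - 32768 (floored at 1), and min_clusters
 * = avefreec - 4 * EXT4_CLUSTERS_PER_GROUP(sb) (floored at 0).
 */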

static int find_group_orlov(struct super_block *sb, struct inode *parent,
			    ext4_group_t *group, umode_t mode,
			    const struct qstr *qstr)
{
	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_group_t real_ngroups = ext4_get_groups_count(sb);
	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
	unsigned int freei, avefreei, grp_free;
	ext4_fsblk_t freec, avefreec;
	unsigned int ndirs;
	int max_dirs, min_inodes;
	ext4_grpblk_t min_clusters;
	ext4_group_t i, grp, g, ngroups;
	struct ext4_group_desc *desc;
	struct orlov_stats stats;
	int flex_size = ext4_flex_bg_size(sbi);
	struct dx_hash_info hinfo;

	ngroups = real_ngroups;
	if (flex_size > 1) {
		ngroups = (real_ngroups + flex_size - 1) >>
			sbi->s_log_groups_per_flex;
		parent_group >>= sbi->s_log_groups_per_flex;
	}

	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
	avefreei = freei / ngroups;
	freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter);
	avefreec = freec;
	do_div(avefreec, ngroups);
	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);

	if (S_ISDIR(mode) &&
	    ((parent == d_inode(sb->s_root)) ||
	     (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
		int best_ndir = inodes_per_group;
		int ret = -1;

		if (qstr) {
			hinfo.hash_version = DX_HASH_HALF_MD4;
			hinfo.seed = sbi->s_hash_seed;
			ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
			parent_group = hinfo.hash % ngroups;
		} else
			parent_group = get_random_u32_below(ngroups);
		for (i = 0; i < ngroups; i++) {
			g = (parent_group + i) % ngroups;
			get_orlov_stats(sb, g, flex_size, &stats);
			if (!stats.free_inodes)
				continue;
			if (stats.used_dirs >= best_ndir)
				continue;
			if (stats.free_inodes < avefreei)
				continue;
			if (stats.free_clusters < avefreec)
				continue;
			grp = g;
			ret = 0;
			best_ndir = stats.used_dirs;
		}
		if (ret)
			goto fallback;
	found_flex_bg:
		if (flex_size == 1) {
			*group = grp;
			return 0;
		}

		/*
		 * We pack inodes at the beginning of the flexgroup's
		 * inode tables.  Block allocation decisions will do
		 * something similar, although regular files will
		 * start at 2nd block group of the flexgroup.  See
		 * ext4_ext_find_goal() and ext4_find_near().
		 */
		grp *= flex_size;
		for (i = 0; i < flex_size; i++) {
			if (grp+i >= real_ngroups)
				break;
			desc = ext4_get_group_desc(sb, grp+i, NULL);
			if (desc && ext4_free_inodes_count(sb, desc)) {
				*group = grp+i;
				return 0;
			}
		}
		goto fallback;
	}

	max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16;
	min_inodes = avefreei - inodes_per_group*flex_size / 4;
	if (min_inodes < 1)
		min_inodes = 1;
	min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
	if (min_clusters < 0)
		min_clusters = 0;

	/*
	 * Start looking in the flex group where we last allocated an
	 * inode for this parent directory
	 */
	if (EXT4_I(parent)->i_last_alloc_group != ~0) {
		parent_group = EXT4_I(parent)->i_last_alloc_group;
		if (flex_size > 1)
			parent_group >>= sbi->s_log_groups_per_flex;
	}

	for (i = 0; i < ngroups; i++) {
		grp = (parent_group + i) % ngroups;
		get_orlov_stats(sb, grp, flex_size, &stats);
		if (stats.used_dirs >= max_dirs)
			continue;
		if (stats.free_inodes < min_inodes)
			continue;
		if (stats.free_clusters < min_clusters)
			continue;
		goto found_flex_bg;
	}

fallback:
	ngroups = real_ngroups;
	avefreei = freei / ngroups;
fallback_retry:
	parent_group = EXT4_I(parent)->i_block_group;
	for (i = 0; i < ngroups; i++) {
		grp = (parent_group + i) % ngroups;
		desc = ext4_get_group_desc(sb, grp, NULL);
		if (desc) {
			grp_free = ext4_free_inodes_count(sb, desc);
			if (grp_free && grp_free >= avefreei) {
				*group = grp;
				return 0;
			}
		}
	}

	if (avefreei) {
		/*
		 * The free-inodes counter is approximate, and for really small
		 * filesystems the above test can fail to find any blockgroups
		 */
		avefreei = 0;
		goto fallback_retry;
	}

	return -1;
}

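/*
 * find_group_other() is the allocation policy for non-directory inodes:
 * first try the parent's own flex group (or block group), then fall
 * back to progressively wider searches, as described by the comments
 * at each step below.
 */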
static int find_group_other(struct super_block *sb, struct inode *parent,
			    ext4_group_t *group, umode_t mode)
{
	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
	ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
	struct ext4_group_desc *desc;
	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));

	/*
	 * Try to place the inode in the same flex group as its
	 * parent.  If we can't find space, use the Orlov algorithm to
	 * find another flex group, and store that information in the
	 * parent directory's inode information so that we use that
	 * flex group for future allocations.
	 */
	if (flex_size > 1) {
		int retry = 0;

	try_again:
		parent_group &= ~(flex_size-1);
		last = parent_group + flex_size;
		if (last > ngroups)
			last = ngroups;
		for (i = parent_group; i < last; i++) {
			desc = ext4_get_group_desc(sb, i, NULL);
			if (desc && ext4_free_inodes_count(sb, desc)) {
				*group = i;
				return 0;
			}
		}
		if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
			retry = 1;
			parent_group = EXT4_I(parent)->i_last_alloc_group;
			goto try_again;
		}
		/*
		 * If this didn't work, use the Orlov search algorithm
		 * to find a new flex group; we pass in the mode to
		 * avoid the topdir algorithms.
		 */
		*group = parent_group + flex_size;
		if (*group > ngroups)
			*group = 0;
		return find_group_orlov(sb, parent, group, mode, NULL);
	}

	/*
	 * Try to place the inode in its parent directory
	 */
	*group = parent_group;
	desc = ext4_get_group_desc(sb, *group, NULL);
	if (desc && ext4_free_inodes_count(sb, desc) &&
	    ext4_free_group_clusters(sb, desc))
		return 0;

	/*
	 * We're going to place this inode in a different blockgroup from its
	 * parent.  We want to cause files in a common directory to all land in
	 * the same blockgroup.  But we want files which are in a different
	 * directory which shares a blockgroup with our parent to land in a
	 * different blockgroup.
	 *
	 * So add our directory's i_ino into the starting point for the hash.
	 */
	*group = (*group + parent->i_ino) % ngroups;

	/*
	 * Use a quadratic hash to find a group with a free inode and some free
	 * blocks.
	 */
	for (i = 1; i < ngroups; i <<= 1) {
		*group += i;
		if (*group >= ngroups)
			*group -= ngroups;
		desc = ext4_get_group_desc(sb, *group, NULL);
		if (desc && ext4_free_inodes_count(sb, desc) &&
		    ext4_free_group_clusters(sb, desc))
			return 0;
	}

	/*
	 * That failed: try linear search for a free inode, even if that group
	 * has no free blocks.
	 */
	*group = parent_group;
	for (i = 0; i < ngroups; i++) {
		if (++*group >= ngroups)
			*group = 0;
		desc = ext4_get_group_desc(sb, *group, NULL);
		if (desc && ext4_free_inodes_count(sb, desc))
			return 0;
	}

	return -1;
}

/*
 * In no journal mode, if an inode has recently been deleted, we want
 * to avoid reusing it until we're reasonably sure the inode table
 * block has been written back to disk.  (Yes, these values are
 * somewhat arbitrary...)
 */
#define RECENTCY_MIN	60
#define RECENTCY_DIRTY	300
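/*
 * In other words: an inode is treated as "recently deleted" for 60
 * seconds after its deletion time, or for 360 seconds if the block
 * holding it is still dirty in the buffer cache.
 */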

static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
{
	struct ext4_group_desc	*gdp;
	struct ext4_inode	*raw_inode;
	struct buffer_head	*bh;
	int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
	int offset, ret = 0;
	int recentcy = RECENTCY_MIN;
	u32 dtime, now;

	gdp = ext4_get_group_desc(sb, group, NULL);
	if (unlikely(!gdp))
		return 0;

	bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) +
		       (ino / inodes_per_block));
	if (!bh || !buffer_uptodate(bh))
		/*
		 * If the block is not in the buffer cache, then it
		 * must have been written out.
		 */
		goto out;

	offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
	raw_inode = (struct ext4_inode *) (bh->b_data + offset);

	/* i_dtime is only 32 bits on disk, but we only care about relative
	 * times in the range of a few minutes (i.e. long enough to sync a
	 * recently-deleted inode to disk), so using the low 32 bits of the
	 * clock (a 68 year range) is enough, see time_before32() */
	dtime = le32_to_cpu(raw_inode->i_dtime);
	now = ktime_get_real_seconds();
	if (buffer_dirty(bh))
		recentcy += RECENTCY_DIRTY;

	if (dtime && time_before32(dtime, now) &&
	    time_before32(now, dtime + recentcy))
		ret = 1;
out:
	brelse(bh);
	return ret;
}

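/*
 * Search the inode bitmap of @group, starting at *ino, for a free
 * inode.  Returns 1 and stores the (zero-based) bit number in *ino on
 * success, or 0 if no usable inode was found.  In no-journal mode a
 * recently deleted inode is returned only as a last resort.
 */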
static int find_inode_bit(struct super_block *sb, ext4_group_t group,
			  struct buffer_head *bitmap, unsigned long *ino)
{
	bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL;
	unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb);

next:
	*ino = ext4_find_next_zero_bit((unsigned long *)
				       bitmap->b_data,
				       EXT4_INODES_PER_GROUP(sb), *ino);
	if (*ino >= EXT4_INODES_PER_GROUP(sb))
		goto not_found;

	if (check_recently_deleted && recently_deleted(sb, group, *ino)) {
		recently_deleted_ino = *ino;
		*ino = *ino + 1;
		if (*ino < EXT4_INODES_PER_GROUP(sb))
			goto next;
		goto not_found;
	}
	return 1;
not_found:
	if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb))
		return 0;
	/*
	 * Not reusing recently deleted inodes is mostly a preference. We don't
	 * want to report ENOSPC or skew allocation patterns because of that.
	 * So return even a recently deleted inode if we could not find a
	 * better one in the given range.
	 */
	*ino = recently_deleted_ino;
	return 1;
}

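/*
 * Mark a specific inode as in use, bypassing the normal allocation
 * policy (used, for instance, by the fast-commit replay code).  The
 * on-disk bitmap and group descriptor are updated and written out
 * synchronously, without a journal handle.
 */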
int ext4_mark_inode_used(struct super_block *sb, int ino)
{
	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
	struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
	struct ext4_group_desc *gdp;
	ext4_group_t group;
	int bit;
	int err;

	if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
		return -EFSCORRUPTED;

	group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
	if (IS_ERR(inode_bitmap_bh))
		return PTR_ERR(inode_bitmap_bh);

	if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) {
		err = 0;
		goto out;
	}

	gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
	if (!gdp) {
		err = -EINVAL;
		goto out;
	}

	ext4_set_bit(bit, inode_bitmap_bh->b_data);

	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}
	err = sync_dirty_buffer(inode_bitmap_bh);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}

	/* We may have to initialize the block bitmap if it isn't already */
	if (ext4_has_group_desc_csum(sb) &&
	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
		struct buffer_head *block_bitmap_bh;

		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
		if (IS_ERR(block_bitmap_bh)) {
			err = PTR_ERR(block_bitmap_bh);
			goto out;
		}

		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
		err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh);
		sync_dirty_buffer(block_bitmap_bh);

		/* recheck and clear flag under lock if we still need to */
		ext4_lock_group(sb, group);
		if (ext4_has_group_desc_csum(sb) &&
		    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
			ext4_free_group_clusters_set(sb, gdp,
				ext4_free_clusters_after_init(sb, group, gdp));
			ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
			ext4_group_desc_csum_set(sb, group, gdp);
		}
		ext4_unlock_group(sb, group);
		brelse(block_bitmap_bh);

		if (err) {
			ext4_std_error(sb, err);
			goto out;
		}
	}

	/* Update the relevant bg descriptor fields */
	if (ext4_has_group_desc_csum(sb)) {
		int free;

		ext4_lock_group(sb, group); /* while we modify the bg desc */
		free = EXT4_INODES_PER_GROUP(sb) -
			ext4_itable_unused_count(sb, gdp);
		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
			free = 0;
		}

		/*
		 * Check the relative inode number against the last used
		 * relative inode number in this group.  If it is greater,
		 * we need to update the bg_itable_unused count.
		 */
		if (bit >= free)
			ext4_itable_unused_set(sb, gdp,
					(EXT4_INODES_PER_GROUP(sb) - bit - 1));
	} else {
		ext4_lock_group(sb, group);
	}

	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
	if (ext4_has_group_desc_csum(sb)) {
		ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
		ext4_group_desc_csum_set(sb, group, gdp);
	}

	ext4_unlock_group(sb, group);
	err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
	sync_dirty_buffer(group_desc_bh);
out:
	brelse(inode_bitmap_bh);
	return err;
}

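/*
 * Estimate the journal credits needed for the xattrs that may be
 * created together with a new inode: default POSIX ACLs, security
 * labels, and (when @encrypt is set) the fscrypt context.
 */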
static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
					    bool encrypt)
{
	struct super_block *sb = dir->i_sb;
	int nblocks = 0;
#ifdef CONFIG_EXT4_FS_POSIX_ACL
	struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT);

	if (IS_ERR(p))
		return PTR_ERR(p);
	if (p) {
		int acl_size = p->a_count * sizeof(ext4_acl_entry);

		nblocks += (S_ISDIR(mode) ? 2 : 1) *
			__ext4_xattr_set_credits(sb, NULL /* inode */,
						 NULL /* block_bh */, acl_size,
						 true /* is_create */);
		posix_acl_release(p);
	}
#endif

#ifdef CONFIG_SECURITY
	{
		int num_security_xattrs = 1;

#ifdef CONFIG_INTEGRITY
		num_security_xattrs++;
#endif
		/*
		 * We assume that security xattrs are never more than 1k.
		 * In practice they are under 128 bytes.
		 */
		nblocks += num_security_xattrs *
			__ext4_xattr_set_credits(sb, NULL /* inode */,
						 NULL /* block_bh */, 1024,
						 true /* is_create */);
	}
#endif
	if (encrypt)
		nblocks += __ext4_xattr_set_credits(sb,
						    NULL /* inode */,
						    NULL /* block_bh */,
						    FSCRYPT_SET_CONTEXT_MAX_SIZE,
						    true /* is_create */);
	return nblocks;
}

/*
 * There are two policies for allocating an inode.  If the new inode is
 * a directory, then a forward search is made for a block group with both
 * free space and a low directory-to-inode ratio; if that fails, then of
 * the groups with above-average free space, that group with the fewest
 * directories already is chosen.
 *
 * For other inodes, search forward from the parent directory's block
 * group to find a free inode.
 */
struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
			       handle_t *handle, struct inode *dir,
			       umode_t mode, const struct qstr *qstr,
			       __u32 goal, uid_t *owner, __u32 i_flags,
			       int handle_type, unsigned int line_no,
			       int nblocks)
{
	struct super_block *sb;
	struct buffer_head *inode_bitmap_bh = NULL;
	struct buffer_head *group_desc_bh;
	ext4_group_t ngroups, group = 0;
	unsigned long ino = 0;
	struct inode *inode;
	struct ext4_group_desc *gdp = NULL;
	struct ext4_inode_info *ei;
	struct ext4_sb_info *sbi;
	int ret2, err;
	struct inode *ret;
	ext4_group_t i;
	ext4_group_t flex_group;
	struct ext4_group_info *grp = NULL;
	bool encrypt = false;

	/* Cannot create files in a deleted directory */
	if (!dir || !dir->i_nlink)
		return ERR_PTR(-EPERM);

	sb = dir->i_sb;
	sbi = EXT4_SB(sb);

	if (unlikely(ext4_forced_shutdown(sb)))
		return ERR_PTR(-EIO);

	ngroups = ext4_get_groups_count(sb);
	trace_ext4_request_inode(dir, mode);
	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	ei = EXT4_I(inode);

	/*
	 * Initialize owners and quota early so that we don't have to account
	 * for quota initialization worst case in standard inode creating
	 * transaction
	 */
	if (owner) {
		inode->i_mode = mode;
		i_uid_write(inode, owner[0]);
		i_gid_write(inode, owner[1]);
	} else if (test_opt(sb, GRPID)) {
		inode->i_mode = mode;
		inode_fsuid_set(inode, idmap);
		inode->i_gid = dir->i_gid;
	} else
		inode_init_owner(idmap, inode, dir, mode);

	if (ext4_has_feature_project(sb) &&
	    ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
		ei->i_projid = EXT4_I(dir)->i_projid;
	else
		ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);

	if (!(i_flags & EXT4_EA_INODE_FL)) {
		err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
		if (err)
			goto out;
	}

	err = dquot_initialize(inode);
	if (err)
		goto out;

	if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
		ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt);
		if (ret2 < 0) {
			err = ret2;
			goto out;
		}
		nblocks += ret2;
	}

	if (!goal)
		goal = sbi->s_inode_goal;

	if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
		group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
		ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
		ret2 = 0;
		goto got_group;
	}

	if (S_ISDIR(mode))
		ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
	else
		ret2 = find_group_other(sb, dir, &group, mode);

got_group:
	EXT4_I(dir)->i_last_alloc_group = group;
	err = -ENOSPC;
	if (ret2 == -1)
		goto out;

	/*
	 * Normally we will only go through one pass of this loop,
	 * unless we get unlucky and it turns out the group we selected
	 * had its last inode grabbed by someone else.
	 */
	for (i = 0; i < ngroups; i++, ino = 0) {
		err = -EIO;

		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
		if (!gdp)
			goto out;

		/*
		 * Check free inodes count before loading bitmap.
		 */
		if (ext4_free_inodes_count(sb, gdp) == 0)
			goto next_group;

		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
			grp = ext4_get_group_info(sb, group);
			/*
			 * Skip groups with already-known suspicious inode
			 * tables
			 */
			if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
				goto next_group;
		}

		brelse(inode_bitmap_bh);
		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
		/* Skip groups with suspicious inode tables */
		if (IS_ERR(inode_bitmap_bh)) {
			inode_bitmap_bh = NULL;
			goto next_group;
		}
		if (!(sbi->s_mount_state & EXT4_FC_REPLAY) &&
		    EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
			goto next_group;

		ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
		if (!ret2)
			goto next_group;

		if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
			ext4_error(sb, "reserved inode found cleared - "
				   "inode=%lu", ino + 1);
			ext4_mark_group_bitmap_corrupted(sb, group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
			goto next_group;
		}

		if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
			BUG_ON(nblocks <= 0);
			handle = __ext4_journal_start_sb(NULL, dir->i_sb,
				 line_no, handle_type, nblocks, 0,
				 ext4_trans_default_revoke_credits(sb));
			if (IS_ERR(handle)) {
				err = PTR_ERR(handle);
				ext4_std_error(sb, err);
				goto out;
			}
		}
		BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh,
						    EXT4_JTR_NONE);
		if (err) {
			ext4_std_error(sb, err);
			goto out;
		}
		ext4_lock_group(sb, group);
		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
		if (ret2) {
			/* Someone already took the bit. Repeat the search
			 * with lock held.
			 */
			ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
			if (ret2) {
				ext4_set_bit(ino, inode_bitmap_bh->b_data);
				ret2 = 0;
			} else {
				ret2 = 1; /* we didn't grab the inode */
			}
		}
		ext4_unlock_group(sb, group);
		ino++;		/* the inode bitmap is zero-based */
		if (!ret2)
			goto got; /* we grabbed the inode! */

next_group:
		if (++group == ngroups)
			group = 0;
	}
	err = -ENOSPC;
	goto out;

got:
	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}

	BUFFER_TRACE(group_desc_bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, sb, group_desc_bh,
					    EXT4_JTR_NONE);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}

	/* We may have to initialize the block bitmap if it isn't already */
	if (ext4_has_group_desc_csum(sb) &&
	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
		struct buffer_head *block_bitmap_bh;

		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
		if (IS_ERR(block_bitmap_bh)) {
			err = PTR_ERR(block_bitmap_bh);
			goto out;
		}
		BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
		err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh,
						    EXT4_JTR_NONE);
		if (err) {
			brelse(block_bitmap_bh);
			ext4_std_error(sb, err);
			goto out;
		}

		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
		err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);

		/* recheck and clear flag under lock if we still need to */
		ext4_lock_group(sb, group);
		if (ext4_has_group_desc_csum(sb) &&
		    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
			ext4_free_group_clusters_set(sb, gdp,
				ext4_free_clusters_after_init(sb, group, gdp));
			ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
			ext4_group_desc_csum_set(sb, group, gdp);
		}
		ext4_unlock_group(sb, group);
		brelse(block_bitmap_bh);

		if (err) {
			ext4_std_error(sb, err);
			goto out;
		}
	}

	/* Update the relevant bg descriptor fields */
	if (ext4_has_group_desc_csum(sb)) {
		int free;
		struct ext4_group_info *grp = NULL;

		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
			grp = ext4_get_group_info(sb, group);
			if (!grp) {
				err = -EFSCORRUPTED;
				goto out;
			}
			down_read(&grp->alloc_sem); /*
						     * protect vs itable
						     * lazyinit
						     */
		}
		ext4_lock_group(sb, group); /* while we modify the bg desc */
		free = EXT4_INODES_PER_GROUP(sb) -
			ext4_itable_unused_count(sb, gdp);
		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
			free = 0;
		}
		/*
		 * Check the relative inode number against the last used
		 * relative inode number in this group.  If it is greater,
		 * we need to update the bg_itable_unused count.
		 */
		if (ino > free)
			ext4_itable_unused_set(sb, gdp,
					(EXT4_INODES_PER_GROUP(sb) - ino));
		if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
			up_read(&grp->alloc_sem);
	} else {
		ext4_lock_group(sb, group);
	}

	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
	if (S_ISDIR(mode)) {
		ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
		if (sbi->s_log_groups_per_flex) {
			ext4_group_t f = ext4_flex_group(sbi, group);

			atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
							f)->used_dirs);
		}
	}
	if (ext4_has_group_desc_csum(sb)) {
		ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
		ext4_group_desc_csum_set(sb, group, gdp);
	}
	ext4_unlock_group(sb, group);

	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}

	percpu_counter_dec(&sbi->s_freeinodes_counter);
	if (S_ISDIR(mode))
		percpu_counter_inc(&sbi->s_dirs_counter);

	if (sbi->s_log_groups_per_flex) {
		flex_group = ext4_flex_group(sbi, group);
		atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups,
						flex_group)->free_inodes);
	}

	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
	/* This is the optimal IO size (for stat), not the fs block size */
	inode->i_blocks = 0;
	simple_inode_init_ts(inode);
	ei->i_crtime = inode_get_mtime(inode);

	memset(ei->i_data, 0, sizeof(ei->i_data));
	ei->i_dir_start_lookup = 0;
	ei->i_disksize = 0;

	/* Don't inherit extent flag from directory, amongst others. */
	ei->i_flags =
		ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
	ei->i_flags |= i_flags;
	ei->i_file_acl = 0;
	ei->i_dtime = 0;
	ei->i_block_group = group;
	ei->i_last_alloc_group = ~0;

	ext4_set_inode_flags(inode, true);
	if (IS_DIRSYNC(inode))
		ext4_handle_sync(handle);
	if (insert_inode_locked(inode) < 0) {
		/*
		 * Likely a bitmap corruption causing inode to be allocated
		 * twice.
		 */
		err = -EIO;
		ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
			   inode->i_ino);
		ext4_mark_group_bitmap_corrupted(sb, group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
		goto out;
	}
	inode->i_generation = get_random_u32();

	/* Precompute checksum seed for inode metadata */
	if (ext4_has_metadata_csum(sb)) {
		__u32 csum;
		__le32 inum = cpu_to_le32(inode->i_ino);
		__le32 gen = cpu_to_le32(inode->i_generation);
		csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
				   sizeof(inum));
		ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
					      sizeof(gen));
	}

	ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
	ext4_set_inode_state(inode, EXT4_STATE_NEW);

	ei->i_extra_isize = sbi->s_want_extra_isize;
	ei->i_inline_off = 0;
	if (ext4_has_feature_inline_data(sb) &&
	    (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
	ret = inode;
	err = dquot_alloc_inode(inode);
	if (err)
		goto fail_drop;

	/*
	 * Since the encryption xattr will always be unique, create it first so
	 * that it's less likely to end up in an external xattr block and
	 * prevent its deduplication.
	 */
	if (encrypt) {
		err = fscrypt_set_context(inode, handle);
		if (err)
			goto fail_free_drop;
	}

	if (!(ei->i_flags & EXT4_EA_INODE_FL)) {
		err = ext4_init_acl(handle, inode, dir);
		if (err)
			goto fail_free_drop;

		err = ext4_init_security(handle, inode, dir, qstr);
		if (err)
			goto fail_free_drop;
	}

	if (ext4_has_feature_extents(sb)) {
		/* set extent flag only for directory, file and normal symlink */
		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
			ext4_ext_tree_init(handle, inode);
		}
	}

	ext4_update_inode_fsync_trans(handle, inode, 1);

	err = ext4_mark_inode_dirty(handle, inode);
	if (err) {
		ext4_std_error(sb, err);
		goto fail_free_drop;
	}

	ext4_debug("allocating inode %lu\n", inode->i_ino);
	trace_ext4_allocate_inode(inode, dir, mode);
	brelse(inode_bitmap_bh);
	return ret;

fail_free_drop:
	dquot_free_inode(inode);
fail_drop:
	clear_nlink(inode);
	unlock_new_inode(inode);
out:
	dquot_drop(inode);
	inode->i_flags |= S_NOQUOTA;
	iput(inode);
	brelse(inode_bitmap_bh);
	return ERR_PTR(err);
}

/* Verify that we are loading a valid orphan from disk */
struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
{
	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
	ext4_group_t block_group;
	int bit;
	struct buffer_head *bitmap_bh = NULL;
	struct inode *inode = NULL;
	int err = -EFSCORRUPTED;

	if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
		goto bad_orphan;

	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
	if (IS_ERR(bitmap_bh))
		return ERR_CAST(bitmap_bh);

	/* Having the inode bit set should be a 100% indicator that this
	 * is a valid orphan (no e2fsck run on fs).  Orphans also include
	 * inodes that were being truncated, so we can't check i_nlink==0.
	 */
	if (!ext4_test_bit(bit, bitmap_bh->b_data))
		goto bad_orphan;

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		ext4_error_err(sb, -err,
			       "couldn't read orphan inode %lu (err %d)",
			       ino, err);
		brelse(bitmap_bh);
		return inode;
	}

	/*
	 * If the orphan has i_nlink > 0 then it should be able to
	 * be truncated, otherwise it won't be removed from the orphan
	 * list during processing and an infinite loop will result.
	 * Similarly, it must not be a bad inode.
	 */
	if ((inode->i_nlink && !ext4_can_truncate(inode)) ||
	    is_bad_inode(inode))
		goto bad_orphan;

	if (NEXT_ORPHAN(inode) > max_ino)
		goto bad_orphan;
	brelse(bitmap_bh);
	return inode;

bad_orphan:
	ext4_error(sb, "bad orphan inode %lu", ino);
	if (bitmap_bh)
		printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n",
		       bit, (unsigned long long)bitmap_bh->b_blocknr,
		       ext4_test_bit(bit, bitmap_bh->b_data));
	if (inode) {
		printk(KERN_ERR "is_bad_inode(inode)=%d\n",
		       is_bad_inode(inode));
		printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n",
		       NEXT_ORPHAN(inode));
		printk(KERN_ERR "max_ino=%lu\n", max_ino);
		printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink);
		/* Avoid freeing blocks if we got a bad deleted inode */
		if (inode->i_nlink == 0)
			inode->i_blocks = 0;
		iput(inode);
	}
	brelse(bitmap_bh);
	return ERR_PTR(err);
}

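/*
 * Count the free inodes in the filesystem by summing the per-group
 * descriptor counts.  With EXT4FS_DEBUG defined, the result is also
 * cross-checked against the on-disk inode bitmaps.
 */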
unsigned long ext4_count_free_inodes(struct super_block *sb)
{
	unsigned long desc_count;
	struct ext4_group_desc *gdp;
	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
#ifdef EXT4FS_DEBUG
	struct ext4_super_block *es;
	unsigned long bitmap_count, x;
	struct buffer_head *bitmap_bh = NULL;

	es = EXT4_SB(sb)->s_es;
	desc_count = 0;
	bitmap_count = 0;
	gdp = NULL;
	for (i = 0; i < ngroups; i++) {
		gdp = ext4_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += ext4_free_inodes_count(sb, gdp);
		brelse(bitmap_bh);
		bitmap_bh = ext4_read_inode_bitmap(sb, i);
		if (IS_ERR(bitmap_bh)) {
			bitmap_bh = NULL;
			continue;
		}

		x = ext4_count_free(bitmap_bh->b_data,
				    EXT4_INODES_PER_GROUP(sb) / 8);
		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
			(unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
		bitmap_count += x;
	}
	brelse(bitmap_bh);
	printk(KERN_DEBUG "ext4_count_free_inodes: "
	       "stored = %u, computed = %lu, %lu\n",
	       le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
	return desc_count;
#else
	desc_count = 0;
	for (i = 0; i < ngroups; i++) {
		gdp = ext4_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += ext4_free_inodes_count(sb, gdp);
		cond_resched();
	}
	return desc_count;
#endif
}

/* Called at mount-time, super-block is locked */
unsigned long ext4_count_dirs(struct super_block *sb)
{
	unsigned long count = 0;
	ext4_group_t i, ngroups = ext4_get_groups_count(sb);

	for (i = 0; i < ngroups; i++) {
		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		count += ext4_used_dirs_count(sb, gdp);
	}
	return count;
}

/*
 * Zero out a not-yet-zeroed inode table by writing zeroes through the
 * whole table.  Must be called without any spinlock held.  On an active
 * filesystem the only caller is the ext4lazyinit thread, so no special
 * locking is needed; however, we must prevent inode allocation from the
 * current group, so we take the alloc_sem lock to block ext4_new_inode()
 * until we are finished.
 */
int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
				 int barrier)
{
	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_group_desc *gdp = NULL;
	struct buffer_head *group_desc_bh;
	handle_t *handle;
	ext4_fsblk_t blk;
	int num, ret = 0, used_blks = 0;
	unsigned long used_inos = 0;

	gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
	if (!gdp || !grp)
		goto out;

	/*
	 * We do not need to lock this, because we are the only one
	 * handling this flag.
	 */
	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
		goto out;

	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	down_write(&grp->alloc_sem);
	/*
	 * If inode bitmap was already initialized there may be some
	 * used inodes so we need to skip blocks with used inodes in
	 * inode table.
	 */
	if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
		used_inos = EXT4_INODES_PER_GROUP(sb) -
			    ext4_itable_unused_count(sb, gdp);
		used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block);

		/* Bogus inode unused count? */
		if (used_blks < 0 || used_blks > sbi->s_itb_per_group) {
			ext4_error(sb, "Something is wrong with group %u: "
				   "used itable blocks: %d; "
				   "itable unused count: %u",
				   group, used_blks,
				   ext4_itable_unused_count(sb, gdp));
			ret = 1;
			goto err_out;
		}

		used_inos += group * EXT4_INODES_PER_GROUP(sb);
		/*
		 * Are there some uninitialized inodes in the inode table
		 * before the first normal inode?
		 */
		if ((used_blks != sbi->s_itb_per_group) &&
		     (used_inos < EXT4_FIRST_INO(sb))) {
			ext4_error(sb, "Something is wrong with group %u: "
				   "itable unused count: %u; "
				   "itables initialized count: %ld",
				   group, ext4_itable_unused_count(sb, gdp),
				   used_inos);
			ret = 1;
			goto err_out;
		}
	}

	blk = ext4_inode_table(sb, gdp) + used_blks;
	num = sbi->s_itb_per_group - used_blks;

	BUFFER_TRACE(group_desc_bh, "get_write_access");
	ret = ext4_journal_get_write_access(handle, sb, group_desc_bh,
					    EXT4_JTR_NONE);
	if (ret)
		goto err_out;

	/*
	 * Skip zeroout if the inode table is full. But we set the ZEROED
	 * flag anyway, because obviously, when it is full it does not need
	 * further zeroing.
	 */
	if (unlikely(num == 0))
		goto skip_zeroout;

	ext4_debug("going to zero out inode table in group %d\n",
		   group);
	ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
	if (ret < 0)
		goto err_out;
	if (barrier)
		blkdev_issue_flush(sb->s_bdev);

skip_zeroout:
	ext4_lock_group(sb, group);
	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
	ext4_group_desc_csum_set(sb, group, gdp);
	ext4_unlock_group(sb, group);

	BUFFER_TRACE(group_desc_bh,
		     "call ext4_handle_dirty_metadata");
	ret = ext4_handle_dirty_metadata(handle, NULL,
					 group_desc_bh);

err_out:
	up_write(&grp->alloc_sem);
	ext4_journal_stop(handle);
out:
	return ret;
}