From 7046ae35329f6d3cd39ae22d96d9525e45e748f2 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 1 Oct 2017 17:57:54 -0400 Subject: ext4: Add iomap support for inline data Report inline data as an IOMAP_F_DATA_INLINE mapping. This allows using iomap_seek_hole and iomap_seek_data in ext4_llseek and makes switching to iomap_fiemap in ext4_fiemap easier. Signed-off-by: Andreas Gruenbacher Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e2abe01c8c6b..ae3e4a25821a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3048,6 +3048,10 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, extern int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int *has_inline, __u64 start, __u64 len); + +struct iomap; +extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); + extern int ext4_try_to_evict_inline_data(handle_t *handle, struct inode *inode, int needed); -- cgit v1.2.3 From 545052e9e35a34af95d2e870ac3fe2894376e6e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 1 Oct 2017 17:58:54 -0400 Subject: ext4: Switch to iomap for SEEK_HOLE / SEEK_DATA Switch to the iomap_seek_hole and iomap_seek_data helpers for implementing lseek SEEK_HOLE / SEEK_DATA, and remove all the code that isn't needed any more. Note that with this patch ext4 will now always depend on the iomap code instead of only when CONFIG_DAX is enabled, and it requires adding a call into the extent status tree for iomap_begin as well to properly deal with delalloc extents. 
Signed-off-by: Christoph Hellwig Signed-off-by: Andreas Gruenbacher Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara [More fixes and cleanups by Andreas] --- fs/ext4/Kconfig | 1 + fs/ext4/ext4.h | 3 - fs/ext4/file.c | 263 +++----------------------------------------------------- fs/ext4/inode.c | 109 ++++++++--------------- 4 files changed, 49 insertions(+), 327 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index e38039fd96ff..73b850f5659c 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -37,6 +37,7 @@ config EXT4_FS select CRC16 select CRYPTO select CRYPTO_CRC32C + select FS_IOMAP help This is the next generation of the ext3 filesystem. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ae3e4a25821a..6fd1fe7456eb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2515,9 +2515,6 @@ extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); -extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, - unsigned int map_len, - struct extent_status *result); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index b1da660ac3bc..67daac3b2ab2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -437,248 +438,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp) return dquot_file_open(inode, filp); } -/* - * Here we use ext4_map_blocks() to get a block mapping for a extent-based - * file rather than ext4_ext_walk_space() because we can introduce - * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same - * function. 
When extent status tree has been fully implemented, it will - * track all extent status for a file and we can directly use it to - * retrieve the offset for SEEK_DATA/SEEK_HOLE. - */ - -/* - * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to - * lookup page cache to check whether or not there has some data between - * [startoff, endoff] because, if this range contains an unwritten extent, - * we determine this extent as a data or a hole according to whether the - * page cache has data or not. - */ -static int ext4_find_unwritten_pgoff(struct inode *inode, - int whence, - ext4_lblk_t end_blk, - loff_t *offset) -{ - struct pagevec pvec; - unsigned int blkbits; - pgoff_t index; - pgoff_t end; - loff_t endoff; - loff_t startoff; - loff_t lastoff; - int found = 0; - - blkbits = inode->i_sb->s_blocksize_bits; - startoff = *offset; - lastoff = startoff; - endoff = (loff_t)end_blk << blkbits; - - index = startoff >> PAGE_SHIFT; - end = (endoff - 1) >> PAGE_SHIFT; - - pagevec_init(&pvec, 0); - do { - int i; - unsigned long nr_pages; - - nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, - &index, end); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - /* - * If current offset is smaller than the page offset, - * there is a hole at this offset. 
- */ - if (whence == SEEK_HOLE && lastoff < endoff && - lastoff < page_offset(pvec.pages[i])) { - found = 1; - *offset = lastoff; - goto out; - } - - lock_page(page); - - if (unlikely(page->mapping != inode->i_mapping)) { - unlock_page(page); - continue; - } - - if (!page_has_buffers(page)) { - unlock_page(page); - continue; - } - - if (page_has_buffers(page)) { - lastoff = page_offset(page); - bh = head = page_buffers(page); - do { - if (lastoff + bh->b_size <= startoff) - goto next; - if (buffer_uptodate(bh) || - buffer_unwritten(bh)) { - if (whence == SEEK_DATA) - found = 1; - } else { - if (whence == SEEK_HOLE) - found = 1; - } - if (found) { - *offset = max_t(loff_t, - startoff, lastoff); - unlock_page(page); - goto out; - } -next: - lastoff += bh->b_size; - bh = bh->b_this_page; - } while (bh != head); - } - - lastoff = page_offset(page) + PAGE_SIZE; - unlock_page(page); - } - - pagevec_release(&pvec); - } while (index <= end); - - /* There are no pages upto endoff - that would be a hole in there. */ - if (whence == SEEK_HOLE && lastoff < endoff) { - found = 1; - *offset = lastoff; - } -out: - pagevec_release(&pvec); - return found; -} - -/* - * ext4_seek_data() retrieves the offset for SEEK_DATA. 
- */ -static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) -{ - struct inode *inode = file->f_mapping->host; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t dataoff, isize; - int blkbits; - int ret; - - inode_lock(inode); - - isize = i_size_read(inode); - if (offset < 0 || offset >= isize) { - inode_unlock(inode); - return -ENXIO; - } - - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - dataoff = offset; - - do { - ret = ext4_get_next_extent(inode, last, end - last + 1, &es); - if (ret <= 0) { - /* No extent found -> no data */ - if (ret == 0) - ret = -ENXIO; - inode_unlock(inode); - return ret; - } - - last = es.es_lblk; - if (last != start) - dataoff = (loff_t)last << blkbits; - if (!ext4_es_is_unwritten(&es)) - break; - - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. - */ - if (ext4_find_unwritten_pgoff(inode, SEEK_DATA, - es.es_lblk + es.es_len, &dataoff)) - break; - last += es.es_len; - dataoff = (loff_t)last << blkbits; - cond_resched(); - } while (last <= end); - - inode_unlock(inode); - - if (dataoff > isize) - return -ENXIO; - - return vfs_setpos(file, dataoff, maxsize); -} - -/* - * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 
- */ -static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) -{ - struct inode *inode = file->f_mapping->host; - struct extent_status es; - ext4_lblk_t start, last, end; - loff_t holeoff, isize; - int blkbits; - int ret; - - inode_lock(inode); - - isize = i_size_read(inode); - if (offset < 0 || offset >= isize) { - inode_unlock(inode); - return -ENXIO; - } - - blkbits = inode->i_sb->s_blocksize_bits; - start = offset >> blkbits; - last = start; - end = isize >> blkbits; - holeoff = offset; - - do { - ret = ext4_get_next_extent(inode, last, end - last + 1, &es); - if (ret < 0) { - inode_unlock(inode); - return ret; - } - /* Found a hole? */ - if (ret == 0 || es.es_lblk > last) { - if (last != start) - holeoff = (loff_t)last << blkbits; - break; - } - /* - * If there is a unwritten extent at this offset, - * it will be as a data or a hole according to page - * cache that has data or not. - */ - if (ext4_es_is_unwritten(&es) && - ext4_find_unwritten_pgoff(inode, SEEK_HOLE, - last + es.es_len, &holeoff)) - break; - - last += es.es_len; - holeoff = (loff_t)last << blkbits; - cond_resched(); - } while (last <= end); - - inode_unlock(inode); - - if (holeoff > isize) - holeoff = isize; - - return vfs_setpos(file, holeoff, maxsize); -} - /* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes @@ -695,18 +454,24 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) maxbytes = inode->i_sb->s_maxbytes; switch (whence) { - case SEEK_SET: - case SEEK_CUR: - case SEEK_END: + default: return generic_file_llseek_size(file, offset, whence, maxbytes, i_size_read(inode)); - case SEEK_DATA: - return ext4_seek_data(file, offset, maxbytes); case SEEK_HOLE: - return ext4_seek_hole(file, offset, maxbytes); + inode_lock_shared(inode); + offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops); + inode_unlock_shared(inode); + break; + case SEEK_DATA: + 
inode_lock_shared(inode); + offset = iomap_seek_data(inode, offset, &ext4_iomap_ops); + inode_unlock_shared(inode); + break; } - return -EINVAL; + if (offset < 0) + return offset; + return vfs_setpos(file, offset, maxbytes); } const struct file_operations ext4_file_operations = { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7755f41bdfc3..edfe95f81274 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3393,7 +3393,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } -#ifdef CONFIG_FS_DAX static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { @@ -3402,6 +3401,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned long first_block = offset >> blkbits; unsigned long last_block = (offset + length - 1) >> blkbits; struct ext4_map_blocks map; + bool delalloc = false; int ret; @@ -3422,9 +3422,33 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_lblk = first_block; map.m_len = last_block - first_block + 1; - if (!(flags & IOMAP_WRITE)) { + if (flags & IOMAP_REPORT) { ret = ext4_map_blocks(NULL, inode, &map, 0); - } else { + if (ret < 0) + return ret; + + if (ret == 0) { + ext4_lblk_t end = map.m_lblk + map.m_len - 1; + struct extent_status es; + + ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es); + + if (!es.es_len || es.es_lblk > end) { + /* entire range is a hole */ + } else if (es.es_lblk > map.m_lblk) { + /* range starts with a hole */ + map.m_len = es.es_lblk - map.m_lblk; + } else { + ext4_lblk_t offs = 0; + + if (es.es_lblk < map.m_lblk) + offs = map.m_lblk - es.es_lblk; + map.m_lblk = es.es_lblk + offs; + map.m_len = es.es_len - offs; + delalloc = true; + } + } + } else if (flags & IOMAP_WRITE) { int dio_credits; handle_t *handle; int retries = 0; @@ -3475,17 +3499,21 @@ retry: } } ext4_journal_stop(handle); + } else { + ret = ext4_map_blocks(NULL, inode, 
&map, 0); + if (ret < 0) + return ret; } iomap->flags = 0; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = sbi->s_daxdev; iomap->offset = first_block << blkbits; + iomap->length = (u64)map.m_len << blkbits; if (ret == 0) { - iomap->type = IOMAP_HOLE; + iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; - iomap->length = (u64)map.m_len << blkbits; } else { if (map.m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; @@ -3496,11 +3524,11 @@ retry: return -EIO; } iomap->addr = (u64)map.m_pblk << blkbits; - iomap->length = (u64)map.m_len << blkbits; } if (map.m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; + return 0; } @@ -3561,8 +3589,6 @@ const struct iomap_ops ext4_iomap_ops = { .iomap_end = ext4_iomap_end, }; -#endif - static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { @@ -6118,70 +6144,3 @@ int ext4_filemap_fault(struct vm_fault *vmf) return err; } - -/* - * Find the first extent at or after @lblk in an inode that is not a hole. - * Search for @map_len blocks at most. The extent is returned in @result. - * - * The function returns 1 if we found an extent. The function returns 0 in - * case there is no extent at or after @lblk and in that case also sets - * @result->es_len to 0. In case of error, the error code is returned. - */ -int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, - unsigned int map_len, struct extent_status *result) -{ - struct ext4_map_blocks map; - struct extent_status es = {}; - int ret; - - map.m_lblk = lblk; - map.m_len = map_len; - - /* - * For non-extent based files this loop may iterate several times since - * we do not determine full hole size. - */ - while (map.m_len > 0) { - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) - return ret; - /* There's extent covering m_lblk? Just return it. 
*/ - if (ret > 0) { - int status; - - ext4_es_store_pblock(result, map.m_pblk); - result->es_lblk = map.m_lblk; - result->es_len = map.m_len; - if (map.m_flags & EXT4_MAP_UNWRITTEN) - status = EXTENT_STATUS_UNWRITTEN; - else - status = EXTENT_STATUS_WRITTEN; - ext4_es_store_status(result, status); - return 1; - } - ext4_es_find_delayed_extent_range(inode, map.m_lblk, - map.m_lblk + map.m_len - 1, - &es); - /* Is delalloc data before next block in extent tree? */ - if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) { - ext4_lblk_t offset = 0; - - if (es.es_lblk < lblk) - offset = lblk - es.es_lblk; - result->es_lblk = es.es_lblk + offset; - ext4_es_store_pblock(result, - ext4_es_pblock(&es) + offset); - result->es_len = es.es_len - offset; - ext4_es_store_status(result, ext4_es_status(&es)); - - return 1; - } - /* There's a hole at m_lblk, advance us after it */ - map.m_lblk += map.m_len; - map_len -= map.m_len; - map.m_len = map_len; - cond_resched(); - } - result->es_len = 0; - return 0; -} -- cgit v1.2.3 From 8058cac6a1d5dc8a2e309fafbfa341fd43d54528 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 12 Oct 2017 12:09:48 -0400 Subject: ext4: remove duplicate extended attributes defs The following commit: commit 9b7365fc1c82 ("ext4: add FS_IOC_FSSETXATTR/FS_IOC_FSGETXATTR interface support") added several defines related to extended attributes to ext4.h. They were added within an #ifndef FS_IOC_FSGETXATTR block with the comment: /* Until the uapi changes get merged for project quota... */ Those uapi changes were merged by this commit: commit 334e580a6f97 ("fs: XFS_IOC_FS[SG]SETXATTR to FS_IOC_FS[SG]ETXATTR promotion") so all the definitions needed by ext4 are available in include/uapi/linux/fs.h. Remove the duplicates from ext4.h. 
Signed-off-by: Ross Zwisler Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 37 ------------------------------------- 1 file changed, 37 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6fd1fe7456eb..16bf888b187e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -644,43 +644,6 @@ enum { #define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT #define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY -#ifndef FS_IOC_FSGETXATTR -/* Until the uapi changes get merged for project quota... */ - -#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) -#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) - -/* - * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. - */ -struct fsxattr { - __u32 fsx_xflags; /* xflags field value (get/set) */ - __u32 fsx_extsize; /* extsize field value (get/set)*/ - __u32 fsx_nextents; /* nextents field value (get) */ - __u32 fsx_projid; /* project identifier (get/set) */ - unsigned char fsx_pad[12]; -}; - -/* - * Flags for the fsx_xflags field - */ -#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ -#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ -#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ -#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ -#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ -#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ -#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ -#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ -#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ -#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ -#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ -#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ -#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ -#define 
FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ -#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ -#endif /* !defined(FS_IOC_FSGETXATTR) */ - #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR -- cgit v1.2.3 From d77147ff443b255d82c907a632c825b2cc610b10 Mon Sep 17 00:00:00 2001 From: harshads Date: Sun, 29 Oct 2017 09:38:46 -0400 Subject: ext4: add support for online resizing with bigalloc This patch adds support for online resizing on bigalloc file system by implementing EXT4_IOC_RESIZE_FS ioctl. Old resize interfaces (add block groups and extend last block group) are left untouched. Tests performed with cluster sizes of 1, 2, 4 and 8 blocks (of size 4k) per cluster. I will add these tests to xfstests. Signed-off-by: Harshad Shirwadkar Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 +-- fs/ext4/ioctl.c | 6 ---- fs/ext4/mballoc.c | 28 ++++++++------- fs/ext4/resize.c | 104 ++++++++++++++++++++++++++++++++++-------------------- 4 files changed, 84 insertions(+), 58 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 16bf888b187e..53ce95b52fd8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -545,8 +545,8 @@ struct ext4_new_group_data { __u64 inode_table; __u32 blocks_count; __u16 reserved_blocks; - __u16 unused; - __u32 free_blocks_count; + __u16 mdata_blocks; + __u32 free_clusters_count; }; /* Indexes used to index group tables in ext4_new_group_data */ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index b0b754b37c36..144bbda2b808 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -871,12 +871,6 @@ group_add_out: int err = 0, err2 = 0; ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; - if (ext4_has_feature_bigalloc(sb)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not (yet) supported with bigalloc"); - return -EOPNOTSUPP; - } - if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, sizeof(__u64))) { return -EFAULT; diff --git 
a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 701085620cd8..d9f8b90a93ed 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4994,8 +4994,11 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; - int err = 0, ret, blk_free_count; - ext4_grpblk_t blocks_freed; + int err = 0, ret, free_clusters_count; + ext4_grpblk_t clusters_freed; + ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); + ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); + unsigned long cluster_count = last_cluster - first_cluster + 1; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); @@ -5007,8 +5010,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * Check to see if we are freeing blocks across a group * boundary. */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - ext4_warning(sb, "too much blocks added to group %u", + if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) { + ext4_warning(sb, "too many blocks added to group %u", block_group); err = -EINVAL; goto error_return; @@ -5054,14 +5057,14 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, if (err) goto error_return; - for (i = 0, blocks_freed = 0; i < count; i++) { + for (i = 0, clusters_freed = 0; i < cluster_count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { ext4_error(sb, "bit already cleared for block %llu", (ext4_fsblk_t)(block + i)); BUFFER_TRACE(bitmap_bh, "bit already cleared"); } else { - blocks_freed++; + clusters_freed++; } } @@ -5075,19 +5078,20 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * them with group lock_held */ ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); - mb_free_blocks(NULL, &e4b, bit, count); - blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); - ext4_free_group_clusters_set(sb, desc, 
blk_free_count); + mb_clear_bits(bitmap_bh->b_data, bit, cluster_count); + mb_free_blocks(NULL, &e4b, bit, cluster_count); + free_clusters_count = clusters_freed + + ext4_free_group_clusters(sb, desc); + ext4_free_group_clusters_set(sb, desc, free_clusters_count); ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_NUM_B2C(sbi, blocks_freed)); + clusters_freed); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + atomic64_add(clusters_freed, &sbi->s_flex_groups[flex_group].free_clusters); } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 035cd3f4785e..be22d7b425e9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -106,7 +106,7 @@ static int verify_group_input(struct super_block *sb, overhead = ext4_group_overhead_blocks(sb, group); metaend = start + overhead; - input->free_blocks_count = free_blocks_count = + input->free_clusters_count = free_blocks_count = input->blocks_count - 2 - overhead - sbi->s_itb_per_group; if (test_opt(sb, DEBUG)) @@ -257,6 +257,7 @@ static int ext4_alloc_group_tables(struct super_block *sb, ext4_group_t last_group; unsigned overhead; __u16 uninit_mask = (flexbg_size > 1) ? 
~EXT4_BG_BLOCK_UNINIT : ~0; + int i; BUG_ON(flex_gd->count == 0 || group_data == NULL); @@ -293,7 +294,7 @@ next_group: group_data[bb_index].block_bitmap = start_blk++; group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; - group_data[group].free_blocks_count--; + group_data[group].mdata_blocks++; flex_gd->bg_flags[group] &= uninit_mask; } @@ -304,7 +305,7 @@ next_group: group_data[ib_index].inode_bitmap = start_blk++; group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; - group_data[group].free_blocks_count--; + group_data[group].mdata_blocks++; flex_gd->bg_flags[group] &= uninit_mask; } @@ -323,15 +324,22 @@ next_group: if (start_blk + itb > next_group_start) { flex_gd->bg_flags[group + 1] &= uninit_mask; overhead = start_blk + itb - next_group_start; - group_data[group + 1].free_blocks_count -= overhead; + group_data[group + 1].mdata_blocks += overhead; itb -= overhead; } - group_data[group].free_blocks_count -= itb; + group_data[group].mdata_blocks += itb; flex_gd->bg_flags[group] &= uninit_mask; start_blk += EXT4_SB(sb)->s_itb_per_group; } + /* Update free clusters count to exclude metadata blocks */ + for (i = 0; i < flex_gd->count; i++) { + group_data[i].free_clusters_count -= + EXT4_NUM_B2C(EXT4_SB(sb), + group_data[i].mdata_blocks); + } + if (test_opt(sb, DEBUG)) { int i; group = group_data[0].group; @@ -341,12 +349,13 @@ next_group: flexbg_size); for (i = 0; i < flex_gd->count; i++) { - printk(KERN_DEBUG "adding %s group %u: %u " - "blocks (%d free)\n", + ext4_debug( + "adding %s group %u: %u blocks (%d free, %d mdata blocks)\n", ext4_bg_has_super(sb, group + i) ? 
"normal" : "no-super", group + i, group_data[i].blocks_count, - group_data[i].free_blocks_count); + group_data[i].free_clusters_count, + group_data[i].mdata_blocks); } } return 0; @@ -398,7 +407,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh) } /* - * set_flexbg_block_bitmap() mark @count blocks starting from @block used. + * set_flexbg_block_bitmap() mark clusters [@first_cluster, @last_cluster] used. * * Helper function for ext4_setup_new_group_blocks() which set . * @@ -408,22 +417,26 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh) */ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, struct ext4_new_flex_group_data *flex_gd, - ext4_fsblk_t block, ext4_group_t count) + ext4_fsblk_t first_cluster, ext4_fsblk_t last_cluster) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t count = last_cluster - first_cluster + 1; ext4_group_t count2; - ext4_debug("mark blocks [%llu/%u] used\n", block, count); - for (count2 = count; count > 0; count -= count2, block += count2) { + ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster, + last_cluster); + for (count2 = count; count > 0; + count -= count2, first_cluster += count2) { ext4_fsblk_t start; struct buffer_head *bh; ext4_group_t group; int err; - group = ext4_get_group_number(sb, block); - start = ext4_group_first_block_no(sb, group); + group = ext4_get_group_number(sb, EXT4_C2B(sbi, first_cluster)); + start = EXT4_B2C(sbi, ext4_group_first_block_no(sb, group)); group -= flex_gd->groups[0].group; - count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start); + count2 = EXT4_CLUSTERS_PER_GROUP(sb) - (first_cluster - start); if (count2 > count) count2 = count; @@ -444,9 +457,9 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, err = ext4_journal_get_write_access(handle, bh); if (err) return err; - ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block, - block - start, count2); - ext4_set_bits(bh->b_data, 
block - start, count2); + ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", + first_cluster, first_cluster - start, count2); + ext4_set_bits(bh->b_data, first_cluster - start, count2); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (unlikely(err)) @@ -595,9 +608,10 @@ handle_bb: if (overhead != 0) { ext4_debug("mark backup superblock %#04llx (+0)\n", start); - ext4_set_bits(bh->b_data, 0, overhead); + ext4_set_bits(bh->b_data, 0, + EXT4_NUM_B2C(sbi, overhead)); } - ext4_mark_bitmap_end(group_data[i].blocks_count, + ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count), sb->s_blocksize * 8, bh->b_data); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) @@ -642,7 +656,11 @@ handle_ib: continue; } err = set_flexbg_block_bitmap(sb, handle, - flex_gd, start, count); + flex_gd, + EXT4_B2C(sbi, start), + EXT4_B2C(sbi, + start + count + - 1)); if (err) goto out; count = group_table_count[j]; @@ -652,7 +670,11 @@ handle_ib: if (count) { err = set_flexbg_block_bitmap(sb, handle, - flex_gd, start, count); + flex_gd, + EXT4_B2C(sbi, start), + EXT4_B2C(sbi, + start + count + - 1)); if (err) goto out; } @@ -840,7 +862,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, ext4_std_error(sb, err); goto exit_inode; } - inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> + (9 - EXT4_SB(sb)->s_cluster_bits); ext4_mark_iloc_dirty(handle, inode, &iloc); memset(gdb_bh->b_data, 0, sb->s_blocksize); err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); @@ -935,6 +958,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, { struct super_block *sb = inode->i_sb; int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + int cluster_bits = EXT4_SB(sb)->s_cluster_bits; struct buffer_head **primary; struct buffer_head *dind; struct ext4_iloc iloc; @@ -1010,7 +1034,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, if (!err) err 
= err2; } - inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + + inode->i_blocks += reserved_gdb * sb->s_blocksize >> (9 - cluster_bits); ext4_mark_iloc_dirty(handle, inode, &iloc); exit_bh: @@ -1244,7 +1269,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, ext4_group_t group; __u16 *bg_flags = flex_gd->bg_flags; int i, gdb_off, gdb_num, err = 0; - + for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) { group = group_data->group; @@ -1271,7 +1296,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, ext4_inode_table_set(sb, gdp, group_data->inode_table); ext4_free_group_clusters_set(sb, gdp, - EXT4_NUM_B2C(sbi, group_data->free_blocks_count)); + group_data->free_clusters_count); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); if (ext4_has_group_desc_csum(sb)) ext4_itable_unused_set(sb, gdp, @@ -1327,7 +1352,7 @@ static void ext4_update_super(struct super_block *sb, */ for (i = 0; i < flex_gd->count; i++) { blocks_count += group_data[i].blocks_count; - free_blocks += group_data[i].free_blocks_count; + free_blocks += EXT4_C2B(sbi, group_data[i].free_clusters_count); } reserved_blocks = ext4_r_blocks_count(es) * 100; @@ -1499,17 +1524,18 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, ext4_fsblk_t n_blocks_count, unsigned long flexbg_size) { - struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; struct ext4_new_group_data *group_data = flex_gd->groups; ext4_fsblk_t o_blocks_count; ext4_group_t n_group; ext4_group_t group; ext4_group_t last_group; ext4_grpblk_t last; - ext4_grpblk_t blocks_per_group; + ext4_grpblk_t clusters_per_group; unsigned long i; - blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb); + clusters_per_group = EXT4_CLUSTERS_PER_GROUP(sb); o_blocks_count = ext4_blocks_count(es); @@ -1530,9 +1556,10 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, int overhead; 
group_data[i].group = group + i; - group_data[i].blocks_count = blocks_per_group; + group_data[i].blocks_count = EXT4_BLOCKS_PER_GROUP(sb); overhead = ext4_group_overhead_blocks(sb, group + i); - group_data[i].free_blocks_count = blocks_per_group - overhead; + group_data[i].mdata_blocks = overhead; + group_data[i].free_clusters_count = EXT4_CLUSTERS_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) { flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | EXT4_BG_INODE_UNINIT; @@ -1546,10 +1573,10 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, /* We need to initialize block bitmap of last group. */ flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; - if ((last_group == n_group) && (last != blocks_per_group - 1)) { - group_data[i - 1].blocks_count = last + 1; - group_data[i - 1].free_blocks_count -= blocks_per_group- - last - 1; + if ((last_group == n_group) && (last != clusters_per_group - 1)) { + group_data[i - 1].blocks_count = EXT4_C2B(sbi, last + 1); + group_data[i - 1].free_clusters_count -= clusters_per_group - + last - 1; } return 1; @@ -1796,7 +1823,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) } /* Do a quick sanity check of the resize inode */ - if (inode->i_blocks != 1 << (inode->i_blkbits - 9)) + if (inode->i_blocks != 1 << (inode->i_blkbits - + (9 - sbi->s_cluster_bits))) goto invalid_resize_inode; for (i = 0; i < EXT4_N_BLOCKS; i++) { if (i == EXT4_DIND_BLOCK) { @@ -1959,7 +1987,7 @@ retry: if (n_group == o_group) add = n_blocks_count - o_blocks_count; else - add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1); + add = EXT4_C2B(sbi, EXT4_CLUSTERS_PER_GROUP(sb) - (offset + 1)); if (add > 0) { err = ext4_group_extend_no_check(sb, o_blocks_count, add); if (err) -- cgit v1.2.3 From 232530680290ba94ca37852ab10d9556ea28badf Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 8 Nov 2017 22:23:20 -0500 Subject: ext4: improve smp scalability for inode generation ->s_next_generation is protected by s_next_gen_lock but its 
usage pattern is very primitive. We don't actually need sequentially increasing new generation numbers, so let's use prandom_u32() instead. Reported-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 -- fs/ext4/ialloc.c | 4 +--- fs/ext4/ioctl.c | 8 +++----- fs/ext4/super.c | 2 -- 4 files changed, 4 insertions(+), 12 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 53ce95b52fd8..5e6d7b6f50c7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1355,8 +1355,6 @@ struct ext4_sb_info { int s_first_ino; unsigned int s_inode_readahead_blks; unsigned int s_inode_goal; - spinlock_t s_next_gen_lock; - u32 s_next_generation; u32 s_hash_seed[4]; int s_def_hash_version; int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ee823022aa34..da79eb5dba40 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1138,9 +1138,7 @@ got: inode->i_ino); goto out; } - spin_lock(&sbi->s_next_gen_lock); - inode->i_generation = sbi->s_next_generation++; - spin_unlock(&sbi->s_next_gen_lock); + inode->i_generation = prandom_u32(); /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 144bbda2b808..23a4766f6678 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -98,7 +99,6 @@ static long swap_inode_boot_loader(struct super_block *sb, int err; struct inode *inode_bl; struct ext4_inode_info *ei_bl; - struct ext4_sb_info *sbi = EXT4_SB(sb); if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) return -EINVAL; @@ -157,10 +157,8 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_ctime = inode_bl->i_ctime = current_time(inode); - spin_lock(&sbi->s_next_gen_lock); - inode->i_generation = sbi->s_next_generation++; - inode_bl->i_generation = sbi->s_next_generation++; - 
spin_unlock(&sbi->s_next_gen_lock); + inode->i_generation = prandom_u32(); + inode_bl->i_generation = prandom_u32(); ext4_discard_preallocations(inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3a278faf5868..9f2e3eb5131f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3982,8 +3982,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_gdb_count = db_count; - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); - spin_lock_init(&sbi->s_next_gen_lock); timer_setup(&sbi->s_err_report, print_daily_error_info, 0); -- cgit v1.2.3