diff options
author | Gao Xiang <hsiangkao@linux.alibaba.com> | 2024-08-30 11:28:37 +0800 |
---|---|---|
committer | Gao Xiang <hsiangkao@linux.alibaba.com> | 2024-09-10 15:26:35 +0800 |
commit | fb176750266a3d7f42ebdcf28e8ba40350b27847 (patch) | |
tree | 89ed0807853bd35f09db1376d76efde8f533c8ee /fs/erofs/super.c | |
parent | 9e2f9d34dd12e6e5b244ec488bcebd0c2d566c50 (diff) |
erofs: add file-backed mount support
It actually has been around for years: For containers and other sandbox
use cases, there will be thousands (and even more) of authenticated
(sub)images running on the same host, unlike OS images.
Of course, all scenarios can use the same EROFS on-disk format, but
bdev-backed mounts just work well for OS images since golden data is
dumped into real block devices. However, it's somewhat hard for
container runtimes to manage and isolate so many unnecessary virtual
block devices safely and efficiently [1]: they just look like a burden
to orchestrators and file-backed mounts are preferred indeed. There
were already enough attempts such as Incremental FS, the original
ComposeFS and PuzzleFS acting in the same way for immutable fses. As
for current EROFS users, ComposeFS, containerd and Android APEXs will
benefit directly from it.
On the other hand, previous experimental feature "erofs over fscache"
was once also intended to provide a similar solution (inspired by
Incremental FS discussion [2]), but the following facts show file-backed
mounts will be a better approach:
- Fscache infrastructure has recently been moved into new Netfslib
which is really an unexpected dependency for EROFS, although it
originally claims "it could be used for caching other things such as
ISO9660 filesystems too." [3]
- It takes an unexpectedly long time to upstream Fscache/Cachefiles
enhancements. For example, the failover feature took more than
one year, and the daemonless feature is still far behind now;
- Ongoing HSM "fanotify pre-content hooks" [4] together with this will
perfectly supersede "erofs over fscache" in a simpler way since
developers (mainly containerd folks) could leverage their existing
caching mechanism entirely in userspace instead of strictly following
the predefined in-kernel caching tree hierarchy.
After "fanotify pre-content hooks" lands upstream to provide the same
functionality, "erofs over fscache" will be removed (as an EROFS
internal improvement and EROFS will not have to bother with on-demand
fetching and/or caching improvements anymore.)
[1] https://github.com/containers/storage/pull/2039
[2] https://lore.kernel.org/r/CAOQ4uxjbVxnubaPjVaGYiSwoGDTdpWbB=w_AeM6YM=zVixsUfQ@mail.gmail.com
[3] https://docs.kernel.org/filesystems/caching/fscache.html
[4] https://lore.kernel.org/r/cover.1723670362.git.josef@toxicpanda.com
Closes: https://github.com/containers/composefs/issues/144
Reviewed-by: Sandeep Dhavale <dhavale@google.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Link: https://lore.kernel.org/r/20240830032840.3783206-1-hsiangkao@linux.alibaba.com
Diffstat (limited to 'fs/erofs/super.c')
-rw-r--r-- | fs/erofs/super.c | 76 |
1 files changed, 49 insertions, 27 deletions
diff --git a/fs/erofs/super.c b/fs/erofs/super.c index aae3fd15899a..9a7e67eceed4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -10,6 +10,7 @@ #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/exportfs.h> +#include <linux/backing-dev.h> #include "xattr.h" #define CREATE_TRACE_POINTS @@ -161,7 +162,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; - struct file *bdev_file; + struct file *file; dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP); if (IS_ERR(dis)) @@ -183,13 +184,17 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { - bdev_file = bdev_file_open_by_path(dif->path, BLK_OPEN_READ, - sb->s_type, NULL); - if (IS_ERR(bdev_file)) - return PTR_ERR(bdev_file); - dif->bdev_file = bdev_file; - dif->dax_dev = fs_dax_get_by_bdev(file_bdev(bdev_file), - &dif->dax_part_off, NULL, NULL); + file = erofs_is_fileio_mode(sbi) ? 
+ filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) : + bdev_file_open_by_path(dif->path, + BLK_OPEN_READ, sb->s_type, NULL); + if (IS_ERR(file)) + return PTR_ERR(file); + + dif->file = file; + if (!erofs_is_fileio_mode(sbi)) + dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), + &dif->dax_part_off, NULL, NULL); } dif->blocks = le32_to_cpu(dis->blocks); @@ -566,15 +571,16 @@ static void erofs_set_sysfs_name(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if (erofs_is_fscache_mode(sb)) { - if (sbi->domain_id) - super_set_sysfs_name_generic(sb, "%s,%s",sbi->domain_id, - sbi->fsid); - else - super_set_sysfs_name_generic(sb, "%s", sbi->fsid); - return; - } - super_set_sysfs_name_id(sb); + if (sbi->domain_id) + super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id, + sbi->fsid); + else if (sbi->fsid) + super_set_sysfs_name_generic(sb, "%s", sbi->fsid); + else if (erofs_is_fileio_mode(sbi)) + super_set_sysfs_name_generic(sb, "%s", + bdi_dev_name(sb->s_bdi)); + else + super_set_sysfs_name_id(sb); } static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) @@ -589,14 +595,15 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &erofs_sops; sbi->blkszbits = PAGE_SHIFT; - if (erofs_is_fscache_mode(sb)) { + if (!sb->s_bdev) { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; - err = erofs_fscache_register_fs(sb); - if (err) - return err; - + if (erofs_is_fscache_mode(sb)) { + err = erofs_fscache_register_fs(sb); + if (err) + return err; + } err = super_setup_bdi(sb); if (err) return err; @@ -693,11 +700,24 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; + int ret; if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); - return get_tree_bdev(fc, erofs_fc_fill_super); + ret = get_tree_bdev(fc, 
erofs_fc_fill_super); +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (ret == -ENOTBLK) { + if (!fc->source) + return invalf(fc, "No source specified"); + sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(sbi->fdev)) + return PTR_ERR(sbi->fdev); + + return get_tree_nodev(fc, erofs_fc_fill_super); + } +#endif + return ret; } static int erofs_fc_reconfigure(struct fs_context *fc) @@ -727,8 +747,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); - if (dif->bdev_file) - fput(dif->bdev_file); + if (dif->file) + fput(dif->file); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); @@ -791,7 +811,7 @@ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) + if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->fdev) kill_anon_super(sb); else kill_block_super(sb); @@ -801,6 +821,8 @@ static void erofs_kill_sb(struct super_block *sb) erofs_fscache_unregister_fs(sb); kfree(sbi->fsid); kfree(sbi->domain_id); + if (sbi->fdev) + fput(sbi->fdev); kfree(sbi); sb->s_fs_info = NULL; } @@ -903,7 +925,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = EROFS_NAME_LEN; if (uuid_is_null(&sb->s_uuid)) - buf->f_fsid = u64_to_fsid(erofs_is_fscache_mode(sb) ? 0 : + buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 : huge_encode_dev(sb->s_bdev->bd_dev)); else buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); |