author     Jakub Kicinski <kuba@kernel.org>  2024-03-28 17:24:10 -0700
committer  Jakub Kicinski <kuba@kernel.org>  2024-03-28 17:25:57 -0700
commit     5e47fbe5cefe5d25d1fa4481c1b9fbe602b4a69f (patch)
tree       b86edc39098cca1d0e53e46dceec6ca856183642 /fs
parent     2a702c2e57908e7bb5c814afeac577a14815c2f2 (diff)
parent     50108c352db70405b3d71d8099d0b3adc3b3352c (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR.
No conflicts, or adjacent changes.
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'fs')
 39 files changed, 609 insertions(+), 372 deletions(-)
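Of note in the fs/btrfs/extent_io.c hunk below: read_extent_buffer_pages() now re-checks EXTENT_BUFFER_UPTODATE after winning the EXTENT_BUFFER_READING bit, because another reader can start and finish the same read between the initial uptodate test and the test_and_set_bit(). The following is a minimal userspace sketch of that claim-then-recheck pattern; the buffer type, flag names, and C11 atomics are illustrative stand-ins for the kernel's bitops and wait_on_bit machinery, not the kernel code itself.

/*
 * Sketch of the claim-then-recheck pattern (hypothetical names):
 * after atomically claiming the "reading" flag, re-test "uptodate"
 * in case a concurrent reader completed in the window between the
 * first test and the claim.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define FLAG_UPTODATE (1u << 0)
#define FLAG_READING  (1u << 1)

struct buffer {
	_Atomic unsigned int flags;
};

/* Returns true if the caller must perform the read itself. */
static bool claim_read(struct buffer *b)
{
	/* Fast path: someone already populated the buffer. */
	if (atomic_load(&b->flags) & FLAG_UPTODATE)
		return false;

	/* Try to become the single reader; fetch_or returns old flags. */
	if (atomic_fetch_or(&b->flags, FLAG_READING) & FLAG_READING)
		return false;	/* another reader owns it; wait elsewhere */

	/*
	 * Re-check: between the first test and the fetch_or, another
	 * reader may have finished and set UPTODATE. Without this test
	 * we would re-read a buffer that is already valid.
	 */
	if (atomic_load(&b->flags) & FLAG_UPTODATE) {
		atomic_fetch_and(&b->flags, ~FLAG_READING);
		/* the kernel hunk also wakes waiters on the READING bit */
		return false;
	}
	return true;
}

/* Publisher side: mark the data valid, then release the claim. */
static void complete_read(struct buffer *b)
{
	atomic_fetch_or(&b->flags, FLAG_UPTODATE);
	atomic_fetch_and(&b->flags, ~FLAG_READING);
}

Without the second check, a task that lost the first race but still won the claim would redo a read whose result is already published; the kernel version additionally issues wake_up_bit() so sleepers on EXTENT_BUFFER_READING are not stranded.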
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 360a5304ec03..b01b1bbf2493 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -344,17 +344,21 @@ void v9fs_evict_inode(struct inode *inode) struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode); __le32 __maybe_unused version; - truncate_inode_pages_final(&inode->i_data); + if (!is_bad_inode(inode)) { + truncate_inode_pages_final(&inode->i_data); - version = cpu_to_le32(v9inode->qid.version); - netfs_clear_inode_writeback(inode, &version); + version = cpu_to_le32(v9inode->qid.version); + netfs_clear_inode_writeback(inode, &version); - clear_inode(inode); - filemap_fdatawrite(&inode->i_data); + clear_inode(inode); + filemap_fdatawrite(&inode->i_data); #ifdef CONFIG_9P_FSCACHE - fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); + if (v9fs_inode_cookie(v9inode)) + fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); #endif + } else + clear_inode(inode); } struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index ef9db3e03506..55dde186041a 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -78,11 +78,11 @@ struct inode *v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid) retval = v9fs_init_inode(v9ses, inode, &fid->qid, st->st_mode, new_decode_dev(st->st_rdev)); + v9fs_stat2inode_dotl(st, inode, 0); kfree(st); if (retval) goto error; - v9fs_stat2inode_dotl(st, inode, 0); v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); @@ -297,7 +297,6 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, umode_t omode) { int err; - struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; kgid_t gid; const unsigned char *name; @@ -307,7 +306,6 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); - v9ses = v9fs_inode2v9ses(dir); omode |= S_IFDIR; if (dir->i_mode & S_ISGID) @@ -739,7 +737,6 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, kgid_t gid; const unsigned char *name; umode_t mode; - struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; struct p9_qid qid; @@ -749,7 +746,6 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, dir->i_ino, dentry, omode, MAJOR(rdev), MINOR(rdev)); - v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 1920ed69279b..3314249e8674 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1359,7 +1359,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); - strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + get_task_comm(psinfo->pr_fname, p); return 0; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5f7587ca1ca7..1e09aeea69c2 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1559,7 +1559,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * needing to allocate extents from the block group. 
*/ used = btrfs_space_info_used(space_info, true); - if (space_info->total_bytes - block_group->length < used) { + if (space_info->total_bytes - block_group->length < used && + block_group->zone_unusable < block_group->length) { /* * Add a reference for the list, compensate for the ref * drop under the "next" label for the diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7441245b1ceb..61594eaf1f89 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4333,6 +4333,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) goto done; + /* + * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above + * test_and_set_bit(EXTENT_BUFFER_READING), someone else could have + * started and finished reading the same eb. In this case, UPTODATE + * will now be set, and we shouldn't read it in again. + */ + if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) { + clear_bit(EXTENT_BUFFER_READING, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + return 0; + } + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; check_buffer_tree_ref(eb); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 347ca13d15a9..445f7716f1e2 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -309,7 +309,7 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) btrfs_warn(fs_info, "no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), - start, len, gen); + start, start + len, gen); ret = -ENOENT; goto out; } @@ -318,7 +318,7 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) btrfs_warn(fs_info, "found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), - em->start, start, len, gen); + em->start, start, start + len, gen); ret = -EUCLEAN; goto out; } @@ -340,9 +340,9 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) em->mod_len = em->len; } - free_extent_map(em); out: write_unlock(&tree->lock); + free_extent_map(em); return ret; } @@ -629,13 +629,13 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, */ ret = merge_extent_mapping(em_tree, existing, em, start); - if (ret) { + if (WARN_ON(ret)) { free_extent_map(em); *em_in = NULL; - WARN_ONCE(ret, -"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu\n", - existing->start, existing->len, - orig_start, orig_len, start); + btrfs_warn(fs_info, +"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu", + existing->start, extent_map_end(existing), + orig_start, orig_start + orig_len, start); } free_extent_map(existing); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c4bd0e60db59..fa25004ab04e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2812,7 +2812,17 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, gen = btrfs_get_last_trans_committed(fs_info); for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); + ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr); + if (ret == -ENOENT) + break; + + if (ret) { + spin_lock(&sctx->stat_lock); + sctx->stat.super_errors++; + spin_unlock(&sctx->stat_lock); + continue; + } + if (bytenr + BTRFS_SUPER_INFO_SIZE > 
scrub_dev->commit_total_bytes) break; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1dc1f1946ae0..f15591f3e54f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -692,6 +692,16 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, device->bdev = file_bdev(bdev_file); clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + if (device->devt != device->bdev->bd_dev) { + btrfs_warn(NULL, + "device %s maj:min changed from %d:%d to %d:%d", + device->name->str, MAJOR(device->devt), + MINOR(device->devt), MAJOR(device->bdev->bd_dev), + MINOR(device->bdev->bd_dev)); + + device->devt = device->bdev->bd_dev; + } + fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -1174,23 +1184,30 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; + int ret = 0; list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { - int ret; + int ret2; - ret = btrfs_open_one_device(fs_devices, device, flags, holder); - if (ret == 0 && + ret2 = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret2 == 0 && (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; - } else if (ret == -ENODATA) { + } else if (ret2 == -ENODATA) { fs_devices->num_devices--; list_del(&device->dev_list); btrfs_free_device(device); } + if (ret == 0 && ret2 != 0) + ret = ret2; } - if (fs_devices->open_devices == 0) + + if (fs_devices->open_devices == 0) { + if (ret) + return ret; return -EINVAL; + } fs_devices->opened = 1; fs_devices->latest_dev = latest_dev; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 5a3d5ec75c5a..4cba80b34387 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1574,11 +1574,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) if (!map) return -EINVAL; - cache->physical_map = btrfs_clone_chunk_map(map, GFP_NOFS); - if (!cache->physical_map) { - ret = -ENOMEM; - goto out; - } + cache->physical_map = map; zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS); if (!zone_info) { @@ -1690,7 +1686,6 @@ out: } bitmap_free(active); kfree(zone_info); - btrfs_free_chunk_map(map); return ret; } @@ -2175,6 +2170,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_chunk_map *map; const bool is_metadata = (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int ret = 0; int i; @@ -2250,6 +2246,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); + down_read(&dev_replace->rwsem); map = block_group->physical_map; for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; @@ -2266,13 +2263,16 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ zinfo->zone_size >> SECTOR_SHIFT); memalloc_nofs_restore(nofs_flags); - if (ret) + if (ret) { + up_read(&dev_replace->rwsem); return ret; + } if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) zinfo->reserved_active_zones++; btrfs_dev_clear_active_zone(device, physical); } + up_read(&dev_replace->rwsem); if (!fully_written) btrfs_dec_block_group_ro(block_group); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 
7fb4aae97412..55051ad09c19 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4634,6 +4634,14 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) iput(inode); spin_lock(&mdsc->cap_delay_lock); } + + /* + * Make sure too many dirty caps or general + * slowness doesn't block mdsc delayed work, + * preventing send_renew_caps() from running. + */ + if (jiffies - loop_start >= 5 * HZ) + break; } spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "done\n"); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index abe8028d95bf..16873d07692f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1138,7 +1138,12 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, } idx = 0; - left = ret > 0 ? ret : 0; + if (ret <= 0) + left = 0; + else if (off + ret > i_size) + left = i_size - off; + else + left = ret; while (left > 0) { size_t plen, copied; @@ -1167,15 +1172,13 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, } if (ret > 0) { - if (off > *ki_pos) { - if (off >= i_size) { - *retry_op = CHECK_EOF; - ret = i_size - *ki_pos; - *ki_pos = i_size; - } else { - ret = off - *ki_pos; - *ki_pos = off; - } + if (off >= i_size) { + *retry_op = CHECK_EOF; + ret = i_size - *ki_pos; + *ki_pos = i_size; + } else { + ret = off - *ki_pos; + *ki_pos = off; } if (last_objver) @@ -2126,14 +2129,16 @@ again: int statret; struct page *page = NULL; loff_t i_size; + int mask = CEPH_STAT_CAP_SIZE; if (retry_op == READ_INLINE) { page = __page_cache_alloc(GFP_KERNEL); if (!page) return -ENOMEM; + + mask = CEPH_STAT_CAP_INLINE_DATA; } - statret = __ceph_do_getattr(inode, page, - CEPH_STAT_CAP_INLINE_DATA, !!page); + statret = __ceph_do_getattr(inode, page, mask, !!page); if (statret < 0) { if (page) __free_page(page); @@ -2174,7 +2179,7 @@ again: /* hit EOF or hole? */ if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && ret < len) { - doutc(cl, "hit hole, ppos %lld < size %lld, reading more\n", + doutc(cl, "may hit hole, ppos %lld < size %lld, reading more\n", iocb->ki_pos, i_size); read += ret; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 69308fd73e4a..c0eb139adb07 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -430,7 +430,6 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: - warnfc(fc, "DAX enabled. 
Warning: EXPERIMENTAL, use at your own risk"); set_opt(&ctx->opt, DAX_ALWAYS); clear_opt(&ctx->opt, DAX_NEVER); return true; diff --git a/fs/exec.c b/fs/exec.c index ff6f26671cfc..cf1df7f16e55 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -895,6 +895,7 @@ int transfer_args_to_stack(struct linux_binprm *bprm, goto out; } + bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE; *sp_location = sp; out: diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 789af5c8fade..aa1626955b2c 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1718,7 +1718,8 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) struct buffer_head *dibh, *bh; struct gfs2_holder rd_gh; unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; - u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift; + unsigned int bsize = 1 << bsize_shift; + u64 lblock = (offset + bsize - 1) >> bsize_shift; __u16 start_list[GFS2_MAX_META_HEIGHT]; __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL; unsigned int start_aligned, end_aligned; @@ -1729,7 +1730,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) u64 prev_bnr = 0; __be64 *start, *end; - if (offset >= maxsize) { + if (offset + bsize - 1 >= maxsize) { /* * The starting point lies beyond the allocated metadata; * there are no blocks to deallocate. diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 3de5047a7ff9..a0017724d523 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -239,7 +239,8 @@ replay_again: .tcon = tcon, .path = path, .create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE), - .desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES, + .desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES | + FILE_READ_EA, .disposition = FILE_OPEN, .fid = pfid, .replay = !!(retries), diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 81d9aafd2210..aa6f1ecb7c0e 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -151,10 +151,6 @@ MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be " "vers=1.0 (CIFS/SMB1) and vers=2.0 are weaker" " and less secure. 
Default: n/N/0"); -extern mempool_t *cifs_sm_req_poolp; -extern mempool_t *cifs_req_poolp; -extern mempool_t *cifs_mid_poolp; - struct workqueue_struct *cifsiod_wq; struct workqueue_struct *decrypt_wq; struct workqueue_struct *fileinfo_put_wq; diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 8be62ed053a2..7ed9d05f6890 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -355,6 +355,9 @@ struct smb_version_operations { /* informational QFS call */ void (*qfs_tcon)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *); + /* query for server interfaces */ + int (*query_server_interfaces)(const unsigned int, struct cifs_tcon *, + bool); /* check if a path is accessible or not */ int (*is_path_accessible)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *); @@ -2104,6 +2107,8 @@ extern struct workqueue_struct *cifsoplockd_wq; extern struct workqueue_struct *deferredclose_wq; extern __u32 cifs_lock_secret; +extern mempool_t *cifs_sm_req_poolp; +extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; /* Operations for different SMB versions */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 86ae578904a2..9b85b5341822 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -52,9 +52,6 @@ #include "fs_context.h" #include "cifs_swn.h" -extern mempool_t *cifs_req_poolp; -extern bool disable_legacy_dialects; - /* FIXME: should these be tunable? */ #define TLINK_ERROR_EXPIRE (1 * HZ) #define TLINK_IDLE_EXPIRE (600 * HZ) @@ -123,12 +120,16 @@ static void smb2_query_server_interfaces(struct work_struct *work) struct cifs_tcon *tcon = container_of(work, struct cifs_tcon, query_interfaces.work); + struct TCP_Server_Info *server = tcon->ses->server; /* * query server network interfaces, in case they change */ + if (!server->ops->query_server_interfaces) + return; + xid = get_xid(); - rc = SMB3_request_interfaces(xid, tcon, false); + rc = server->ops->query_server_interfaces(xid, tcon, false); free_xid(xid); if (rc) { diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index ec25d3c3e1ee..16aadce492b2 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -486,7 +486,6 @@ struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->uid = current_fsuid(); cfile->dentry = dget(dentry); cfile->f_flags = file->f_flags; - cfile->status_file_deleted = false; cfile->invalidHandle = false; cfile->deferred_close_scheduled = false; cfile->tlink = cifs_get_tlink(tlink); @@ -1073,6 +1072,19 @@ void smb2_deferred_work_close(struct work_struct *work) _cifsFileInfo_put(cfile, true, false); } +static bool +smb2_can_defer_close(struct inode *inode, struct cifs_deferred_close *dclose) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsInodeInfo *cinode = CIFS_I(inode); + + return (cifs_sb->ctx->closetimeo && cinode->lease_granted && dclose && + (cinode->oplock == CIFS_CACHE_RHW_FLG || + cinode->oplock == CIFS_CACHE_RH_FLG) && + !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags)); + +} + int cifs_close(struct inode *inode, struct file *file) { struct cifsFileInfo *cfile; @@ -1086,10 +1098,8 @@ int cifs_close(struct inode *inode, struct file *file) cfile = file->private_data; file->private_data = NULL; dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); - if ((cifs_sb->ctx->closetimeo && cinode->oplock == CIFS_CACHE_RHW_FLG) - && cinode->lease_granted && - !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) && - dclose && 
!(cfile->status_file_deleted)) { + if ((cfile->status_file_deleted == false) && + (smb2_can_defer_close(inode, dclose))) { if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 8177ec59afee..d28ab0af6049 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -401,7 +401,6 @@ cifs_get_file_info_unix(struct file *filp) cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); } else if (rc == -EREMOTE) { cifs_create_junction_fattr(&fattr, inode->i_sb); - rc = 0; } else goto cifs_gfiunix_out; @@ -820,8 +819,10 @@ cifs_get_file_info(struct file *filp) void *page = alloc_dentry_path(); const unsigned char *path; - if (!server->ops->query_file_info) + if (!server->ops->query_file_info) { + free_dentry_path(page); return -ENOSYS; + } xid = get_xid(); rc = server->ops->query_file_info(xid, tcon, cfile, &data); @@ -835,8 +836,8 @@ cifs_get_file_info(struct file *filp) } path = build_path_from_dentry(dentry, page); if (IS_ERR(path)) { - free_dentry_path(page); - return PTR_ERR(path); + rc = PTR_ERR(path); + goto cgfi_exit; } cifs_open_info_to_fattr(&fattr, &data, inode->i_sb); if (fattr.cf_flags & CIFS_FATTR_DELETE_PENDING) @@ -844,7 +845,6 @@ cifs_get_file_info(struct file *filp) break; case -EREMOTE: cifs_create_junction_fattr(&fattr, inode->i_sb); - rc = 0; break; case -EOPNOTSUPP: case -EINVAL: @@ -1009,7 +1009,6 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, struct kvec rsp_iov, *iov = NULL; int rsp_buftype = CIFS_NO_BUFFER; u32 tag = data->reparse.tag; - struct inode *inode = NULL; int rc = 0; if (!tag && server->ops->query_reparse_point) { @@ -1049,12 +1048,8 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, if (tcon->posix_extensions) smb311_posix_info_to_fattr(fattr, data, sb); - else { + else cifs_open_info_to_fattr(fattr, data, sb); - inode = cifs_iget(sb, fattr); - if (inode && fattr->cf_flags & CIFS_FATTR_DELETE_PENDING) - cifs_mark_open_handles_for_deleted_file(inode, full_path); - } out: fattr->cf_cifstag = data->reparse.tag; free_rsp_buf(rsp_buftype, rsp_iov.iov_base); @@ -1109,9 +1104,9 @@ static int cifs_get_fattr(struct cifs_open_info_data *data, full_path, fattr); } else { cifs_open_info_to_fattr(fattr, data, sb); - if (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING) - cifs_mark_open_handles_for_deleted_file(*inode, full_path); } + if (!rc && fattr->cf_flags & CIFS_FATTR_DELETE_PENDING) + cifs_mark_open_handles_for_deleted_file(*inode, full_path); break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -1340,6 +1335,8 @@ int smb311_posix_get_inode_info(struct inode **inode, goto out; rc = update_inode_info(sb, &fattr, inode); + if (!rc && fattr.cf_flags & CIFS_FATTR_DELETE_PENDING) + cifs_mark_open_handles_for_deleted_file(*inode, full_path); out: kfree(fattr.cf_symlink_target); return rc; @@ -1501,6 +1498,9 @@ iget_root: goto out; } + if (!rc && fattr.cf_flags & CIFS_FATTR_DELETE_PENDING) + cifs_mark_open_handles_for_deleted_file(inode, path); + if (rc && tcon->pipe) { cifs_dbg(FYI, "ipc connection - fake read inode\n"); spin_lock(&inode->i_lock); diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 9428a0db7718..c3771fc81328 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -27,9 +27,6 @@ #include "fs_context.h" #include "cached_dir.h" -extern mempool_t *cifs_sm_req_poolp; -extern mempool_t *cifs_req_poolp; - /* The xid serves as a useful identifier 
for each incoming vfs request, in a similar way to the mid which is useful to track each sent smb, and CurrentXid can also provide a running counter (although it diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 8f37373fd333..3216f786908f 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -230,7 +230,7 @@ int cifs_try_adding_channels(struct cifs_ses *ses) spin_lock(&ses->iface_lock); if (!ses->iface_count) { spin_unlock(&ses->iface_lock); - cifs_dbg(VFS, "server %s does not advertise interfaces\n", + cifs_dbg(ONCE, "server %s does not advertise interfaces\n", ses->server->hostname); break; } @@ -396,7 +396,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) spin_lock(&ses->iface_lock); if (!ses->iface_count) { spin_unlock(&ses->iface_lock); - cifs_dbg(VFS, "server %s does not advertise interfaces\n", ses->server->hostname); + cifs_dbg(ONCE, "server %s does not advertise interfaces\n", ses->server->hostname); return; } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 6ee22d0dbc00..2ed456948f34 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -5290,6 +5290,7 @@ struct smb_version_operations smb30_operations = { .tree_connect = SMB2_tcon, .tree_disconnect = SMB2_tdis, .qfs_tcon = smb3_qfs_tcon, + .query_server_interfaces = SMB3_request_interfaces, .is_path_accessible = smb2_is_path_accessible, .can_echo = smb2_can_echo, .echo = SMB2_echo, @@ -5405,6 +5406,7 @@ struct smb_version_operations smb311_operations = { .tree_connect = SMB2_tcon, .tree_disconnect = SMB2_tdis, .qfs_tcon = smb3_qfs_tcon, + .query_server_interfaces = SMB3_request_interfaces, .is_path_accessible = smb2_is_path_accessible, .can_echo = smb2_can_echo, .echo = SMB2_echo, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index e5e6b14f8cae..3ea688558e6c 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -409,14 +409,15 @@ skip_sess_setup: spin_unlock(&ses->ses_lock); if (!rc && - (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { + (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL) && + server->ops->query_server_interfaces) { mutex_unlock(&ses->session_mutex); /* * query server network interfaces, in case they change */ xid = get_xid(); - rc = SMB3_request_interfaces(xid, tcon, false); + rc = server->ops->query_server_interfaces(xid, tcon, false); free_xid(xid); if (rc == -EOPNOTSUPP && ses->chan_count > 1) { diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 20784f76a604..1b594307c9d5 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -227,7 +227,7 @@ struct smb2_compression_hdr { __le32 OriginalCompressedSegmentSize; __le16 CompressionAlgorithm; __le16 Flags; - __le16 Offset; /* this is the size of the uncompressed SMB2 header below */ + __le32 Offset; /* this is the size of the uncompressed SMB2 header below */ /* uncompressed SMB2 header (READ or WRITE) goes here */ /* compressed data goes here */ } __packed; @@ -280,15 +280,16 @@ struct smb3_blob_data { #define SE_GROUP_RESOURCE 0x20000000 #define SE_GROUP_LOGON_ID 0xC0000000 -/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */ - struct sid_array_data { __le16 SidAttrCount; /* SidAttrList - array of sid_attr_data structs */ } __packed; -struct luid_attr_data { - +/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */ +struct sid_attr_data { + __le16 BlobSize; + __u8 BlobData[]; + /* __le32 Attr */ } __packed; /* @@ -502,6 +503,7 @@ struct 
smb2_encryption_neg_context { #define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003) /* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */ #define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */ +#define SMB3_COMPRESS_LZ4 cpu_to_le16(0x0005) /* Compression Flags */ #define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index d013c5b3f1ed..ac77ac1fd73e 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -1742,17 +1742,22 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) err = dbg_walk_index(c, NULL, add_size, &calc); if (err) { ubifs_err(c, "error %d while walking the index", err); - return err; + goto out_err; } if (calc != idx_size) { ubifs_err(c, "index size check failed: calculated size is %lld, should be %lld", calc, idx_size); dump_stack(); - return -EINVAL; + err = -EINVAL; + goto out_err; } return 0; + +out_err: + ubifs_destroy_tnc_tree(c); + return err; } /** diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 551148de66cd..eac0fef801f1 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1133,6 +1133,8 @@ out_cancel: dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); out_inode: + /* Free inode->i_link before inode is marked as bad. */ + fscrypt_free_inode(inode); make_bad_inode(inode); iput(inode); out_fname: diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5029eb3390a5..a1f46919934c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -96,36 +96,36 @@ dump: return -EINVAL; } -static int do_readpage(struct page *page) +static int do_readpage(struct folio *folio) { void *addr; int err = 0, i; unsigned int block, beyond; - struct ubifs_data_node *dn; - struct inode *inode = page->mapping->host; + struct ubifs_data_node *dn = NULL; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; loff_t i_size = i_size_read(inode); dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, page->index, i_size, page->flags); - ubifs_assert(c, !PageChecked(page)); - ubifs_assert(c, !PagePrivate(page)); + inode->i_ino, folio->index, i_size, folio->flags); + ubifs_assert(c, !folio_test_checked(folio)); + ubifs_assert(c, !folio->private); - addr = kmap(page); + addr = kmap_local_folio(folio, 0); - block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; if (block >= beyond) { /* Reading beyond inode */ - SetPageChecked(page); - memset(addr, 0, PAGE_SIZE); + folio_set_checked(folio); + addr = folio_zero_tail(folio, 0, addr); goto out; } dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS); if (!dn) { err = -ENOMEM; - goto error; + goto out; } i = 0; @@ -150,39 +150,35 @@ static int do_readpage(struct page *page) memset(addr + ilen, 0, dlen - ilen); } } - if (++i >= UBIFS_BLOCKS_PER_PAGE) + if (++i >= (UBIFS_BLOCKS_PER_PAGE << folio_order(folio))) break; block += 1; addr += UBIFS_BLOCK_SIZE; + if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) { + kunmap_local(addr - UBIFS_BLOCK_SIZE); + addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE); + } } + if (err) { struct ubifs_info *c = inode->i_sb->s_fs_info; if (err == -ENOENT) { /* Not found, so it must be a hole */ - SetPageChecked(page); + folio_set_checked(folio); dbg_gen("hole"); - goto out_free; + err = 0; + } else { + ubifs_err(c, "cannot read page %lu of inode %lu, error %d", + folio->index, inode->i_ino, err); } - ubifs_err(c, "cannot read page %lu of inode %lu, error 
%d", - page->index, inode->i_ino, err); - goto error; } -out_free: - kfree(dn); out: - SetPageUptodate(page); - ClearPageError(page); - flush_dcache_page(page); - kunmap(page); - return 0; - -error: kfree(dn); - ClearPageUptodate(page); - SetPageError(page); - flush_dcache_page(page); - kunmap(page); + if (!err) + folio_mark_uptodate(folio); + flush_dcache_folio(folio); + kunmap_local(addr); return err; } @@ -222,16 +218,16 @@ static int write_begin_slow(struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; struct ubifs_budget_req req = { .new_page = 1 }; int err, appending = !!(pos + len > inode->i_size); - struct page *page; + struct folio *folio; dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", inode->i_ino, pos, len, inode->i_size); /* - * At the slow path we have to budget before locking the page, because - * budgeting may force write-back, which would wait on locked pages and - * deadlock if we had the page locked. At this point we do not know - * anything about the page, so assume that this is a new page which is + * At the slow path we have to budget before locking the folio, because + * budgeting may force write-back, which would wait on locked folios and + * deadlock if we had the folio locked. At this point we do not know + * anything about the folio, so assume that this is a new folio which is * written to a hole. This corresponds to largest budget. Later the * budget will be amended if this is not true. */ @@ -243,45 +239,43 @@ static int write_begin_slow(struct address_space *mapping, if (unlikely(err)) return err; - page = grab_cache_page_write_begin(mapping, index); - if (unlikely(!page)) { + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { ubifs_release_budget(c, &req); - return -ENOMEM; + return PTR_ERR(folio); } - if (!PageUptodate(page)) { - if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) - SetPageChecked(page); + if (!folio_test_uptodate(folio)) { + if (pos == folio_pos(folio) && len >= folio_size(folio)) + folio_set_checked(folio); else { - err = do_readpage(page); + err = do_readpage(folio); if (err) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); ubifs_release_budget(c, &req); return err; } } - - SetPageUptodate(page); - ClearPageError(page); } - if (PagePrivate(page)) + if (folio->private) /* - * The page is dirty, which means it was budgeted twice: + * The folio is dirty, which means it was budgeted twice: * o first time the budget was allocated by the task which - * made the page dirty and set the PG_private flag; + * made the folio dirty and set the private field; * o and then we budgeted for it for the second time at the * very beginning of this function. * - * So what we have to do is to release the page budget we + * So what we have to do is to release the folio budget we * allocated. */ release_new_page_budget(c); - else if (!PageChecked(page)) + else if (!folio_test_checked(folio)) /* - * We are changing a page which already exists on the media. - * This means that changing the page does not make the amount + * We are changing a folio which already exists on the media. + * This means that changing the folio does not make the amount * of indexing information larger, and this part of the budget * which we have already acquired may be released. 
*/ @@ -304,14 +298,14 @@ static int write_begin_slow(struct address_space *mapping, ubifs_release_dirty_inode_budget(c, ui); } - *pagep = page; + *pagep = &folio->page; return 0; } /** * allocate_budget - allocate budget for 'ubifs_write_begin()'. * @c: UBIFS file-system description object - * @page: page to allocate budget for + * @folio: folio to allocate budget for * @ui: UBIFS inode object the page belongs to * @appending: non-zero if the page is appended * @@ -322,15 +316,15 @@ static int write_begin_slow(struct address_space *mapping, * * Returns: %0 in case of success and %-ENOSPC in case of failure. */ -static int allocate_budget(struct ubifs_info *c, struct page *page, +static int allocate_budget(struct ubifs_info *c, struct folio *folio, struct ubifs_inode *ui, int appending) { struct ubifs_budget_req req = { .fast = 1 }; - if (PagePrivate(page)) { + if (folio->private) { if (!appending) /* - * The page is dirty and we are not appending, which + * The folio is dirty and we are not appending, which * means no budget is needed at all. */ return 0; @@ -354,11 +348,11 @@ static int allocate_budget(struct ubifs_info *c, struct page *page, */ req.dirtied_ino = 1; } else { - if (PageChecked(page)) + if (folio_test_checked(folio)) /* * The page corresponds to a hole and does not * exist on the media. So changing it makes - * make the amount of indexing information + * the amount of indexing information * larger, and we have to budget for a new * page. */ @@ -428,7 +422,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; int err, appending = !!(pos + len > inode->i_size); int skipped_read = 0; - struct page *page; + struct folio *folio; ubifs_assert(c, ubifs_inode(inode)->ui_size == inode->i_size); ubifs_assert(c, !c->ro_media && !c->ro_mount); @@ -437,13 +431,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, return -EROFS; /* Try out the fast-path part first */ - page = grab_cache_page_write_begin(mapping, index); - if (unlikely(!page)) - return -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* The page is not loaded from the flash */ - if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) { + if (pos == folio_pos(folio) && len >= folio_size(folio)) { /* * We change whole page so no need to load it. But we * do not know whether this page exists on the media or @@ -453,32 +448,27 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * media. Thus, we are setting the @PG_checked flag * here. */ - SetPageChecked(page); + folio_set_checked(folio); skipped_read = 1; } else { - err = do_readpage(page); + err = do_readpage(folio); if (err) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } } - - SetPageUptodate(page); - ClearPageError(page); } - err = allocate_budget(c, page, ui, appending); + err = allocate_budget(c, folio, ui, appending); if (unlikely(err)) { ubifs_assert(c, err == -ENOSPC); /* * If we skipped reading the page because we were going to * write all of it, then it is not up to date. 
*/ - if (skipped_read) { - ClearPageChecked(page); - ClearPageUptodate(page); - } + if (skipped_read) + folio_clear_checked(folio); /* * Budgeting failed which means it would have to force * write-back but didn't, because we set the @fast flag in the @@ -490,8 +480,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); mutex_unlock(&ui->ui_mutex); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return write_begin_slow(mapping, pos, len, pagep); } @@ -502,22 +492,21 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * with @ui->ui_mutex locked if we are appending pages, and unlocked * otherwise. This is an optimization (slightly hacky though). */ - *pagep = page; + *pagep = &folio->page; return 0; - } /** * cancel_budget - cancel budget. * @c: UBIFS file-system description object - * @page: page to cancel budget for + * @folio: folio to cancel budget for * @ui: UBIFS inode object the page belongs to * @appending: non-zero if the page is appended * * This is a helper function for a page write operation. It unlocks the * @ui->ui_mutex in case of appending. */ -static void cancel_budget(struct ubifs_info *c, struct page *page, +static void cancel_budget(struct ubifs_info *c, struct folio *folio, struct ubifs_inode *ui, int appending) { if (appending) { @@ -525,8 +514,8 @@ static void cancel_budget(struct ubifs_info *c, struct page *page, ubifs_release_dirty_inode_budget(c, ui); mutex_unlock(&ui->ui_mutex); } - if (!PagePrivate(page)) { - if (PageChecked(page)) + if (!folio->private) { + if (folio_test_checked(folio)) release_new_page_budget(c); else release_existing_page_budget(c); @@ -537,6 +526,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -544,44 +534,47 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, int appending = !!(end_pos > inode->i_size); dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", - inode->i_ino, pos, page->index, len, copied, inode->i_size); + inode->i_ino, pos, folio->index, len, copied, inode->i_size); - if (unlikely(copied < len && len == PAGE_SIZE)) { + if (unlikely(copied < len && !folio_test_uptodate(folio))) { /* - * VFS copied less data to the page that it intended and + * VFS copied less data to the folio than it intended and * declared in its '->write_begin()' call via the @len - * argument. If the page was not up-to-date, and @len was - * @PAGE_SIZE, the 'ubifs_write_begin()' function did + * argument. If the folio was not up-to-date, + * the 'ubifs_write_begin()' function did * not load it from the media (for optimization reasons). This - * means that part of the page contains garbage. So read the - * page now. + * means that part of the folio contains garbage. So read the + * folio now. */ dbg_gen("copied %d instead of %d, read page and repeat", copied, len); - cancel_budget(c, page, ui, appending); - ClearPageChecked(page); + cancel_budget(c, folio, ui, appending); + folio_clear_checked(folio); /* * Return 0 to force VFS to repeat the whole operation, or the * error code if 'do_readpage()' fails. 
*/ - copied = do_readpage(page); + copied = do_readpage(folio); goto out; } - if (!PagePrivate(page)) { - attach_page_private(page, (void *)1); + if (len == folio_size(folio)) + folio_mark_uptodate(folio); + + if (!folio->private) { + folio_attach_private(folio, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); - __set_page_dirty_nobuffers(page); + filemap_dirty_folio(mapping, folio); } if (appending) { i_size_write(inode, end_pos); ui->ui_size = end_pos; /* - * Note, we do not set @I_DIRTY_PAGES (which means that the - * inode has dirty pages), this has been done in - * '__set_page_dirty_nobuffers()'. + * We do not set @I_DIRTY_PAGES (which means that + * the inode has dirty pages), this was done in + * filemap_dirty_folio(). */ __mark_inode_dirty(inode, I_DIRTY_DATASYNC); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); @@ -589,43 +582,43 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, } out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return copied; } /** * populate_page - copy data nodes into a page for bulk-read. * @c: UBIFS file-system description object - * @page: page + * @folio: folio * @bu: bulk-read information * @n: next zbranch slot * * Returns: %0 on success and a negative error code on failure. */ -static int populate_page(struct ubifs_info *c, struct page *page, +static int populate_page(struct ubifs_info *c, struct folio *folio, struct bu_info *bu, int *n) { int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; loff_t i_size = i_size_read(inode); unsigned int page_block; void *addr, *zaddr; pgoff_t end_index; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, page->index, i_size, page->flags); + inode->i_ino, folio->index, i_size, folio->flags); - addr = zaddr = kmap(page); + addr = zaddr = kmap_local_folio(folio, 0); end_index = (i_size - 1) >> PAGE_SHIFT; - if (!i_size || page->index > end_index) { + if (!i_size || folio->index > end_index) { hole = 1; - memset(addr, 0, PAGE_SIZE); + addr = folio_zero_tail(folio, 0, addr); goto out_hole; } - page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + page_block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; while (1) { int err, len, out_len, dlen; @@ -674,9 +667,13 @@ static int populate_page(struct ubifs_info *c, struct page *page, break; addr += UBIFS_BLOCK_SIZE; page_block += 1; + if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) { + kunmap_local(addr - UBIFS_BLOCK_SIZE); + addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE); + } } - if (end_index == page->index) { + if (end_index == folio->index) { int len = i_size & (PAGE_SIZE - 1); if (len && len < read) @@ -685,22 +682,19 @@ static int populate_page(struct ubifs_info *c, struct page *page, out_hole: if (hole) { - SetPageChecked(page); + folio_set_checked(folio); dbg_gen("hole"); } - SetPageUptodate(page); - ClearPageError(page); - flush_dcache_page(page); - kunmap(page); + folio_mark_uptodate(folio); + flush_dcache_folio(folio); + kunmap_local(addr); *n = nn; return 0; out_err: - ClearPageUptodate(page); - SetPageError(page); - flush_dcache_page(page); - kunmap(page); + flush_dcache_folio(folio); + kunmap_local(addr); ubifs_err(c, "bad data node (block %u, inode %lu)", page_block, inode->i_ino); return -EINVAL; @@ -710,15 +704,15 @@ out_err: * ubifs_do_bulk_read - do bulk-read. 
* @c: UBIFS file-system description object * @bu: bulk-read information - * @page1: first page to read + * @folio1: first folio to read * * Returns: %1 if the bulk-read is done, otherwise %0 is returned. */ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, - struct page *page1) + struct folio *folio1) { - pgoff_t offset = page1->index, end_index; - struct address_space *mapping = page1->mapping; + pgoff_t offset = folio1->index, end_index; + struct address_space *mapping = folio1->mapping; struct inode *inode = mapping->host; struct ubifs_inode *ui = ubifs_inode(inode); int err, page_idx, page_cnt, ret = 0, n = 0; @@ -768,11 +762,11 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, goto out_warn; } - err = populate_page(c, page1, bu, &n); + err = populate_page(c, folio1, bu, &n); if (err) goto out_warn; - unlock_page(page1); + folio_unlock(folio1); ret = 1; isize = i_size_read(inode); @@ -782,19 +776,19 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, for (page_idx = 1; page_idx < page_cnt; page_idx++) { pgoff_t page_offset = offset + page_idx; - struct page *page; + struct folio *folio; if (page_offset > end_index) break; - page = pagecache_get_page(mapping, page_offset, + folio = __filemap_get_folio(mapping, page_offset, FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT, ra_gfp_mask); - if (!page) + if (IS_ERR(folio)) break; - if (!PageUptodate(page)) - err = populate_page(c, page, bu, &n); - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) + err = populate_page(c, folio, bu, &n); + folio_unlock(folio); + folio_put(folio); if (err) break; } @@ -817,7 +811,7 @@ out_bu_off: /** * ubifs_bulk_read - determine whether to bulk-read and, if so, do it. - * @page: page from which to start bulk-read. + * @folio: folio from which to start bulk-read. * * Some flash media are capable of reading sequentially at faster rates. UBIFS * bulk-read facility is designed to take advantage of that, by reading in one @@ -826,12 +820,12 @@ out_bu_off: * * Returns: %1 if a bulk-read is done and %0 otherwise. 
*/ -static int ubifs_bulk_read(struct page *page) +static int ubifs_bulk_read(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); - pgoff_t index = page->index, last_page_read = ui->last_page_read; + pgoff_t index = folio->index, last_page_read = ui->last_page_read; struct bu_info *bu; int err = 0, allocated = 0; @@ -879,8 +873,8 @@ static int ubifs_bulk_read(struct page *page) bu->buf_len = c->max_bu_buf_len; data_key_init(c, &bu->key, inode->i_ino, - page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT); - err = ubifs_do_bulk_read(c, bu, page); + folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT); + err = ubifs_do_bulk_read(c, bu, folio); if (!allocated) mutex_unlock(&c->bu_mutex); @@ -894,69 +888,71 @@ out_unlock: static int ubifs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - - if (ubifs_bulk_read(page)) + if (ubifs_bulk_read(folio)) return 0; - do_readpage(page); + do_readpage(folio); folio_unlock(folio); return 0; } -static int do_writepage(struct page *page, int len) +static int do_writepage(struct folio *folio, size_t len) { - int err = 0, i, blen; + int err = 0, blen; unsigned int block; void *addr; + size_t offset = 0; union ubifs_key key; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; #ifdef UBIFS_DEBUG struct ubifs_inode *ui = ubifs_inode(inode); spin_lock(&ui->ui_lock); - ubifs_assert(c, page->index <= ui->synced_i_size >> PAGE_SHIFT); + ubifs_assert(c, folio->index <= ui->synced_i_size >> PAGE_SHIFT); spin_unlock(&ui->ui_lock); #endif - /* Update radix tree tags */ - set_page_writeback(page); + folio_start_writeback(folio); - addr = kmap(page); - block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; - i = 0; - while (len) { - blen = min_t(int, len, UBIFS_BLOCK_SIZE); + addr = kmap_local_folio(folio, offset); + block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + for (;;) { + blen = min_t(size_t, len, UBIFS_BLOCK_SIZE); data_key_init(c, &key, inode->i_ino, block); err = ubifs_jnl_write_data(c, inode, &key, addr, blen); if (err) break; - if (++i >= UBIFS_BLOCKS_PER_PAGE) + len -= blen; + if (!len) break; block += 1; addr += blen; - len -= blen; + if (folio_test_highmem(folio) && !offset_in_page(addr)) { + kunmap_local(addr - blen); + offset += PAGE_SIZE; + addr = kmap_local_folio(folio, offset); + } } + kunmap_local(addr); if (err) { - SetPageError(page); - ubifs_err(c, "cannot write page %lu of inode %lu, error %d", - page->index, inode->i_ino, err); + mapping_set_error(folio->mapping, err); + ubifs_err(c, "cannot write folio %lu of inode %lu, error %d", + folio->index, inode->i_ino, err); ubifs_ro_mode(c, err); } - ubifs_assert(c, PagePrivate(page)); - if (PageChecked(page)) + ubifs_assert(c, folio->private != NULL); + if (folio_test_checked(folio)) release_new_page_budget(c); else release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - detach_page_private(page); - ClearPageChecked(page); + folio_detach_private(folio); + folio_clear_checked(folio); - kunmap(page); - unlock_page(page); - end_page_writeback(page); + folio_unlock(folio); + folio_end_writeback(folio); return err; } @@ -1006,22 +1002,21 @@ static int do_writepage(struct page *page, int len) * on the page lock and it would not write the truncated inode node to the * journal before we have finished. 
*/ -static int ubifs_writepage(struct page *page, struct writeback_control *wbc) +static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, + void *data) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); loff_t i_size = i_size_read(inode), synced_i_size; - pgoff_t end_index = i_size >> PAGE_SHIFT; - int err, len = i_size & (PAGE_SIZE - 1); - void *kaddr; + int err, len = folio_size(folio); dbg_gen("ino %lu, pg %lu, pg flags %#lx", - inode->i_ino, page->index, page->flags); - ubifs_assert(c, PagePrivate(page)); + inode->i_ino, folio->index, folio->flags); + ubifs_assert(c, folio->private != NULL); - /* Is the page fully outside @i_size? (truncate in progress) */ - if (page->index > end_index || (page->index == end_index && !len)) { + /* Is the folio fully outside @i_size? (truncate in progress) */ + if (folio_pos(folio) >= i_size) { err = 0; goto out_unlock; } @@ -1030,9 +1025,9 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) synced_i_size = ui->synced_i_size; spin_unlock(&ui->ui_lock); - /* Is the page fully inside @i_size? */ - if (page->index < end_index) { - if (page->index >= synced_i_size >> PAGE_SHIFT) { + /* Is the folio fully inside i_size? */ + if (folio_pos(folio) + len <= i_size) { + if (folio_pos(folio) >= synced_i_size) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_redirty; @@ -1045,20 +1040,18 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) * with this. */ } - return do_writepage(page, PAGE_SIZE); + return do_writepage(folio, len); } /* - * The page straddles @i_size. It must be zeroed out on each and every + * The folio straddles @i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - kaddr = kmap_atomic(page); - memset(kaddr + len, 0, PAGE_SIZE - len); - flush_dcache_page(page); - kunmap_atomic(kaddr); + len = i_size - folio_pos(folio); + folio_zero_segment(folio, len, folio_size(folio)); if (i_size > synced_i_size) { err = inode->i_sb->s_op->write_inode(inode, NULL); @@ -1066,19 +1059,25 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) goto out_redirty; } - return do_writepage(page, len); + return do_writepage(folio, len); out_redirty: /* - * redirty_page_for_writepage() won't call ubifs_dirty_inode() because + * folio_redirty_for_writepage() won't call ubifs_dirty_inode() because * it passes I_DIRTY_PAGES flag while calling __mark_inode_dirty(), so * there is no need to do space budget for dirty inode. */ - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); out_unlock: - unlock_page(page); + folio_unlock(folio); return err; } +static int ubifs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return write_cache_pages(mapping, wbc, ubifs_writepage, NULL); +} + /** * do_attr_changes - change inode attributes. 
* @inode: inode to change attributes for @@ -1155,11 +1154,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, if (offset) { pgoff_t index = new_size >> PAGE_SHIFT; - struct page *page; + struct folio *folio; - page = find_lock_page(inode->i_mapping, index); - if (page) { - if (PageDirty(page)) { + folio = filemap_lock_folio(inode->i_mapping, index); + if (!IS_ERR(folio)) { + if (folio_test_dirty(folio)) { /* * 'ubifs_jnl_truncate()' will try to truncate * the last data node, but it contains @@ -1168,14 +1167,14 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, * 'ubifs_jnl_truncate()' will see an already * truncated (and up to date) data node. */ - ubifs_assert(c, PagePrivate(page)); + ubifs_assert(c, folio->private != NULL); - clear_page_dirty_for_io(page); + folio_clear_dirty_for_io(folio); if (UBIFS_BLOCKS_PER_PAGE_SHIFT) - offset = new_size & - (PAGE_SIZE - 1); - err = do_writepage(page, offset); - put_page(page); + offset = offset_in_folio(folio, + new_size); + err = do_writepage(folio, offset); + folio_put(folio); if (err) goto out_budg; /* @@ -1188,8 +1187,8 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, * to 'ubifs_jnl_truncate()' to save it from * having to read it. */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } } } @@ -1512,14 +1511,14 @@ static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags) */ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec64 now = current_time(inode); struct ubifs_budget_req req = { .new_page = 1 }; int err, update_time; - dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, + dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, folio->index, i_size_read(inode)); ubifs_assert(c, !c->ro_media && !c->ro_mount); @@ -1527,17 +1526,17 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* -EROFS */ /* - * We have not locked @page so far so we may budget for changing the - * page. Note, we cannot do this after we locked the page, because + * We have not locked @folio so far so we may budget for changing the + * folio. Note, we cannot do this after we locked the folio, because * budgeting may cause write-back which would cause deadlock. * - * At the moment we do not know whether the page is dirty or not, so we - * assume that it is not and budget for a new page. We could look at + * At the moment we do not know whether the folio is dirty or not, so we + * assume that it is not and budget for a new folio. We could look at * the @PG_private flag and figure this out, but we may race with write - * back and the page state may change by the time we lock it, so this + * back and the folio state may change by the time we lock it, so this * would need additional care. We do not bother with this at the * moment, although it might be good idea to do. Instead, we allocate - * budget for a new page and amend it later on if the page was in fact + * budget for a new folio and amend it later on if the folio was in fact * dirty. 
* * The budgeting-related logic of this function is similar to what we @@ -1560,21 +1559,21 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } - lock_page(page); - if (unlikely(page->mapping != inode->i_mapping || - page_offset(page) > i_size_read(inode))) { - /* Page got truncated out from underneath us */ + folio_lock(folio); + if (unlikely(folio->mapping != inode->i_mapping || + folio_pos(folio) >= i_size_read(inode))) { + /* Folio got truncated out from underneath us */ goto sigbus; } - if (PagePrivate(page)) + if (folio->private) release_new_page_budget(c); else { - if (!PageChecked(page)) + if (!folio_test_checked(folio)) ubifs_convert_page_budget(c); - attach_page_private(page, (void *)1); + folio_attach_private(folio, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); - __set_page_dirty_nobuffers(page); + filemap_dirty_folio(folio->mapping, folio); } if (update_time) { @@ -1590,11 +1589,11 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) ubifs_release_dirty_inode_budget(c, ui); } - wait_for_stable_page(page); + folio_wait_stable(folio); return VM_FAULT_LOCKED; sigbus: - unlock_page(page); + folio_unlock(folio); ubifs_release_budget(c, &req); return VM_FAULT_SIGBUS; } @@ -1648,7 +1647,7 @@ static int ubifs_symlink_getattr(struct mnt_idmap *idmap, const struct address_space_operations ubifs_file_address_operations = { .read_folio = ubifs_read_folio, - .writepage = ubifs_writepage, + .writepages = ubifs_writepages, .write_begin = ubifs_write_begin, .write_end = ubifs_write_end, .invalidate_folio = ubifs_invalidate_folio, diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 873e6e1c92b5..6ebf3c04ac5f 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -82,8 +82,9 @@ static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) */ static int scan_for_dirty_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -166,8 +167,7 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, data.pick_free = pick_free; data.lnum = -1; data.exclude_index = exclude_index; - err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_for_dirty_cb, + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_dirty_cb, &data); if (err) return ERR_PTR(err); @@ -349,8 +349,9 @@ out: */ static int scan_for_free_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -446,7 +447,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, data.pick_free = pick_free; data.lnum = -1; err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_for_free_cb, + scan_for_free_cb, &data); if (err) return ERR_PTR(err); @@ -589,8 +590,9 @@ out: */ static int scan_for_idx_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -625,8 +627,7 @@ static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c) int err; data.lnum = -1; - err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_for_idx_cb, + err = ubifs_lpt_scan_nolock(c, -1, 
@@ -625,8 +627,7 @@ static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c)
 	int err;
 
 	data.lnum = -1;
-	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
-				    (ubifs_lpt_scan_callback)scan_for_idx_cb,
+	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_idx_cb,
 				    &data);
 	if (err)
 		return ERR_PTR(err);
@@ -726,11 +727,10 @@ out:
 	return err;
 }
 
-static int cmp_dirty_idx(const struct ubifs_lprops **a,
-			 const struct ubifs_lprops **b)
+static int cmp_dirty_idx(const void *a, const void *b)
 {
-	const struct ubifs_lprops *lpa = *a;
-	const struct ubifs_lprops *lpb = *b;
+	const struct ubifs_lprops *lpa = *(const struct ubifs_lprops **)a;
+	const struct ubifs_lprops *lpb = *(const struct ubifs_lprops **)b;
 
 	return lpa->dirty + lpa->free - lpb->dirty - lpb->free;
 }
@@ -754,7 +754,7 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
 	       sizeof(void *) * c->dirty_idx.cnt);
 	/* Sort it so that the dirtiest is now at the end */
 	sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *),
-	     (int (*)(const void *, const void *))cmp_dirty_idx, NULL);
+	     cmp_dirty_idx, NULL);
 	dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt);
 	if (c->dirty_idx.cnt)
 		dbg_find("dirtiest index LEB is %d with dirty %d and free %d",
@@ -782,8 +782,9 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
 */
 static int scan_dirty_idx_cb(struct ubifs_info *c,
			   const struct ubifs_lprops *lprops, int in_tree,
-			   struct scan_data *data)
+			   void *arg)
 {
+	struct scan_data *data = arg;
 	int ret = LPT_SCAN_CONTINUE;
 
 	/* Exclude LEBs that are currently in use */
@@ -842,8 +843,7 @@ static int find_dirty_idx_leb(struct ubifs_info *c)
 	if (c->pnodes_have >= c->pnode_cnt)
 		/* All pnodes are in memory, so skip scan */
 		return -ENOSPC;
-	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
-				    (ubifs_lpt_scan_callback)scan_dirty_idx_cb,
+	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_dirty_idx_cb,
 				    &data);
 	if (err)
 		return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f0a5538c84b0..74aee92433d7 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -293,6 +293,96 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
 }
 
 /**
+ * __queue_and_wait - queue a task and wait until the task is woken up.
+ * @c: UBIFS file-system description object
+ *
+ * This function adds the current task to the wait queue and sleeps until it
+ * is woken up. It should be called with @c->reserve_space_wq locked.
+ */
+static void __queue_and_wait(struct ubifs_info *c)
+{
+	DEFINE_WAIT(wait);
+
+	__add_wait_queue_entry_tail_exclusive(&c->reserve_space_wq, &wait);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_unlock(&c->reserve_space_wq.lock);
+
+	schedule();
+	finish_wait(&c->reserve_space_wq, &wait);
+}
+
+/**
+ * wait_for_reservation - queue the current task to wait until woken up.
+ * @c: UBIFS file-system description object
+ *
+ * This function queues the current task to wait until it is woken up, if
+ * queuing has started (@c->need_wait_space is not %0). Returns %true if the
+ * current task was queued, otherwise %false.
+ */
+static bool wait_for_reservation(struct ubifs_info *c)
+{
+	if (likely(atomic_read(&c->need_wait_space) == 0))
+		/* Quick path to check whether queuing is started. */
+		return false;
+
+	spin_lock(&c->reserve_space_wq.lock);
+	if (atomic_read(&c->need_wait_space) == 0) {
+		/* Queuing is not started, don't queue current task. */
+		spin_unlock(&c->reserve_space_wq.lock);
+		return false;
+	}
+
+	__queue_and_wait(c);
+	return true;
+}
+
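A note on __queue_and_wait() above, since its locking looks unusual: the entry is added with __add_wait_queue_entry_tail_exclusive(), i.e. with WQ_FLAG_EXCLUSIVE set, so a single wake-up releases exactly one waiter, and the wait-queue spinlock (taken by the caller) is dropped only after set_current_state(), so no wake-up can be lost between queuing and sleeping. A condensed, annotated sketch of the same pattern, generic rather than the UBIFS function itself:

/* Sketch: exclusive sleep; caller holds wq->lock on entry. */
static void queue_and_wait_exclusive(wait_queue_head_t *wq)
{
	DEFINE_WAIT(wait);	/* on-stack entry for the current task */

	/* Exclusive entry: one wake-up releases exactly one waiter. */
	__add_wait_queue_entry_tail_exclusive(wq, &wait);
	set_current_state(TASK_UNINTERRUPTIBLE);
	spin_unlock(&wq->lock);	/* only now can a waker see and wake us */

	schedule();		/* sleep until woken */
	finish_wait(wq, &wait);	/* dequeue and restore TASK_RUNNING */
}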
+/**
+ * wake_up_reservation - wake up the first task in the queue or stop queuing.
+ * @c: UBIFS file-system description object
+ *
+ * This function wakes up the first task in the queue if one exists, or stops
+ * queuing if the queue is empty.
+ */
+static void wake_up_reservation(struct ubifs_info *c)
+{
+	spin_lock(&c->reserve_space_wq.lock);
+	if (waitqueue_active(&c->reserve_space_wq))
+		wake_up_locked(&c->reserve_space_wq);
+	else
+		/*
+		 * Unlike wait_for_reservation(), @c->need_wait_space is
+		 * cleared under the protection of the wait queue lock, which
+		 * avoids resetting it to 0 after a new task has queued.
+		 */
+		atomic_set(&c->need_wait_space, 0);
+	spin_unlock(&c->reserve_space_wq.lock);
+}
+
+/**
+ * add_or_start_queue - add the current task to the queue or start queuing.
+ * @c: UBIFS file-system description object
+ *
+ * This function starts queuing if queuing has not started, otherwise it adds
+ * the current task to the queue.
+ */
+static void add_or_start_queue(struct ubifs_info *c)
+{
+	spin_lock(&c->reserve_space_wq.lock);
+	if (atomic_cmpxchg(&c->need_wait_space, 0, 1) == 0) {
+		/* Queuing has just started, the task can go on directly. */
+		spin_unlock(&c->reserve_space_wq.lock);
+		return;
+	}
+
+	/*
+	 * At least two tasks have retried more than 32 times at this point:
+	 * the first task has started queuing, so just queue the remaining
+	 * tasks.
+	 */
+	__queue_and_wait(c);
+}
+
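Taken together, the three helpers above implement a contention throttle around journal reservation: normally tasks do not queue at all; once a task detects that it is livelocked it flips @need_wait_space, and from then on reservations proceed one at a time until the queue drains. A hedged sketch of how a caller threads them together (simplified from make_reservation() below; try_reserve() and livelocked() are hypothetical stand-ins):

/* Sketch: serializing reservations once livelock is detected. */
static int reserve_serialized(struct ubifs_info *c, int jhead, int len)
{
	bool blocked = wait_for_reservation(c);	/* sleeps if queuing is on */
	int err;

retry:
	err = try_reserve(c, jhead, len);	/* hypothetical stand-in */
	if (err == -EAGAIN) {
		if (!blocked && livelocked(c)) {	/* hypothetical */
			blocked = true;
			add_or_start_queue(c);	/* flip the flag or queue up */
		}
		goto retry;
	}

	if (blocked)
		wake_up_reservation(c);	/* pass the baton to one waiter */
	return err;
}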
+/**
  * make_reservation - reserve journal space.
  * @c: UBIFS file-system description object
  * @jhead: journal head
@@ -311,33 +401,27 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
 static int make_reservation(struct ubifs_info *c, int jhead, int len)
 {
 	int err, cmt_retries = 0, nospc_retries = 0;
+	bool blocked = wait_for_reservation(c);
 
 again:
 	down_read(&c->commit_sem);
 	err = reserve_space(c, jhead, len);
-	if (!err)
+	if (!err) {
 		/* c->commit_sem will get released via finish_reservation(). */
-		return 0;
+		goto out_wake_up;
+	}
 	up_read(&c->commit_sem);
 
 	if (err == -ENOSPC) {
 		/*
 		 * GC could not make any progress. We should try to commit
-		 * once because it could make some dirty space and GC would
-		 * make progress, so make the error -EAGAIN so that the below
+		 * because it could make some dirty space and GC would make
+		 * progress, so make the error -EAGAIN so that the below
 		 * will commit and re-try.
 		 */
-		if (nospc_retries++ < 2) {
-			dbg_jnl("no space, retry");
-			err = -EAGAIN;
-		}
-
-		/*
-		 * This means that the budgeting is incorrect. We always have
-		 * to be able to write to the media, because all operations are
-		 * budgeted. Deletions are not budgeted, though, but we reserve
-		 * an extra LEB for them.
-		 */
+		nospc_retries++;
+		dbg_jnl("no space, retry");
+		err = -EAGAIN;
 	}
 
 	if (err != -EAGAIN)
@@ -349,15 +433,37 @@ again:
 	 */
 	if (cmt_retries > 128) {
 		/*
-		 * This should not happen unless the journal size limitations
-		 * are too tough.
+		 * This should not happen unless:
+		 * 1. The journal size limitations are too tough.
+		 * 2. The budgeting is incorrect. We always have to be able to
+		 *    write to the media, because all operations are budgeted.
+		 *    Deletions are not budgeted, though, but we reserve an
+		 *    extra LEB for them.
 		 */
-		ubifs_err(c, "stuck in space allocation");
+		ubifs_err(c, "stuck in space allocation, nospc_retries %d",
+			  nospc_retries);
 		err = -ENOSPC;
 		goto out;
-	} else if (cmt_retries > 32)
-		ubifs_warn(c, "too many space allocation re-tries (%d)",
-			   cmt_retries);
+	} else if (cmt_retries > 32) {
+		/*
+		 * This is almost impossible, unless many tasks are making
+		 * reservations concurrently and some task has retried
+		 * gc + commit many times, with the space generated during
+		 * this period grabbed by other tasks.
+		 * But if it happens, queue up all tasks that will make space
+		 * reservations; then there is only one task making a space
+		 * reservation at any time, and it can always succeed under
+		 * the premise of correct budgeting.
+		 */
+		ubifs_warn(c, "too many space allocation cmt_retries (%d) "
+			   "nospc_retries (%d), start queuing tasks",
+			   cmt_retries, nospc_retries);
+
+		if (!blocked) {
+			blocked = true;
+			add_or_start_queue(c);
+		}
+	}
 
 	dbg_jnl("-EAGAIN, commit and retry (retried %d times)",
 		cmt_retries);
@@ -365,7 +471,7 @@ again:
 
 	err = ubifs_run_commit(c);
 	if (err)
-		return err;
+		goto out_wake_up;
 	goto again;
 
 out:
@@ -380,6 +486,27 @@ out:
 		cmt_retries = dbg_check_lprops(c);
 		up_write(&c->commit_sem);
 	}
+out_wake_up:
+	if (blocked) {
+		/*
+		 * Only tasks that have ever started queuing or have ever been
+		 * queued can wake up other queued tasks, which makes sure
+		 * that only one task is woken up to make a space reservation.
+		 * For example:
+		 *   task A              task B              task C
+		 *                       make_reservation    make_reservation
+		 *   reserve_space // 0
+		 *   wake_up_reservation
+		 *                       atomic_cmpxchg // 0, start queuing
+		 *                       reserve_space
+		 *                                           wait_for_reservation
+		 *                                            __queue_and_wait
+		 *                                             add_wait_queue
+		 *                       if (blocked) // false
+		 *                       // So task C won't be woken up to
+		 *                       // race with task B
+		 */
+		wake_up_reservation(c);
+	}
 	return err;
 }
 
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 6d6cd85c2b4c..a11c3dab7e16 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1014,8 +1014,9 @@ out:
 */
 static int scan_check_cb(struct ubifs_info *c,
			 const struct ubifs_lprops *lp, int in_tree,
-			 struct ubifs_lp_stats *lst)
+			 void *arg)
 {
+	struct ubifs_lp_stats *lst = arg;
 	struct ubifs_scan_leb *sleb;
 	struct ubifs_scan_node *snod;
 	int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
@@ -1269,8 +1270,7 @@ int dbg_check_lprops(struct ubifs_info *c)
 
 	memset(&lst, 0, sizeof(struct ubifs_lp_stats));
 	err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
-				    (ubifs_lpt_scan_callback)scan_check_cb,
-				    &lst);
+				    scan_check_cb, &lst);
 	if (err && err != -ENOSPC)
 		goto out;
 
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index c4d079328b92..07351fdce722 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1646,7 +1646,6 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
 		len -= node_len;
 	}
 
-	err = 0;
 out:
 	vfree(buf);
 	return err;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 7f4031a15f4d..291583005dd1 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2151,6 +2151,8 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
 		mutex_init(&c->bu_mutex);
 		mutex_init(&c->write_reserve_mutex);
 		init_waitqueue_head(&c->cmt_wq);
+		init_waitqueue_head(&c->reserve_space_wq);
+		atomic_set(&c->need_wait_space, 0);
 		c->buds = RB_ROOT;
 		c->old_idx = RB_ROOT;
 		c->size_tree = RB_ROOT;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f4728e65d1bd..45cacdcd4746 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -3116,14 +3116,7 @@ static void tnc_destroy_cnext(struct ubifs_info *c)
 void ubifs_tnc_close(struct ubifs_info *c)
 {
 	tnc_destroy_cnext(c);
-	if (c->zroot.znode) {
-		long n, freed;
-
-		n = atomic_long_read(&c->clean_zn_cnt);
-		freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode);
-		ubifs_assert(c, freed == n);
-		atomic_long_sub(n, &ubifs_clean_zn_cnt);
-	}
+	ubifs_destroy_tnc_tree(c);
 	kfree(c->gap_lebs);
 	kfree(c->ilebs);
	destroy_old_idx(c);
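One detail worth flagging in the tnc.c hunk above: the old open-coded teardown never cleared c->zroot.znode, so it was only safe to run once at close time. The replacement helper, added to tnc_misc.c below, NULLs the root pointer after freeing, which makes the teardown idempotent and therefore callable from other unwind paths as well. A generic sketch of that idempotent-destructor idiom (names hypothetical):

/* Sketch: a destroy helper that is safe to call more than once. */
static void tree_destroy(struct my_tree *t)
{
	if (!t->root)		/* already torn down: nothing to do */
		return;

	free_subtree(t->root);	/* hypothetical recursive free */
	t->root = NULL;		/* mark destroyed for any later caller */
}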
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index 4d686e34e64d..d3f8a6aa1f49 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -251,6 +251,28 @@ long ubifs_destroy_tnc_subtree(const struct ubifs_info *c,
 }
 
 /**
+ * ubifs_destroy_tnc_tree - destroy all znodes connected to the TNC tree.
+ * @c: UBIFS file-system description object
+ *
+ * This function destroys the whole TNC tree and updates the clean global
+ * znode count.
+ */
+void ubifs_destroy_tnc_tree(struct ubifs_info *c)
+{
+	long n, freed;
+
+	if (!c->zroot.znode)
+		return;
+
+	n = atomic_long_read(&c->clean_zn_cnt);
+	freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode);
+	ubifs_assert(c, freed == n);
+	atomic_long_sub(n, &ubifs_clean_zn_cnt);
+
+	c->zroot.znode = NULL;
+}
+
+/**
 * read_znode - read an indexing node from flash and fill znode.
 * @c: UBIFS file-system description object
 * @zzbr: the zbranch describing the node to read
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 3916dc4f30ca..1f3ea879d93a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1047,6 +1047,8 @@ struct ubifs_debug_info;
 * @bg_bud_bytes: number of bud bytes when background commit is initiated
 * @old_buds: buds to be released after commit ends
 * @max_bud_cnt: maximum number of buds
+ * @need_wait_space: non-%0 means space reservation tasks need to wait in queue
+ * @reserve_space_wq: wait queue to sleep on if @need_wait_space is not %0
 *
 * @commit_sem: synchronizes committer with other processes
 * @cmt_state: commit state
@@ -1305,6 +1307,8 @@ struct ubifs_info {
 	long long bg_bud_bytes;
 	struct list_head old_buds;
 	int max_bud_cnt;
+	atomic_t need_wait_space;
+	wait_queue_head_t reserve_space_wq;
 
 	struct rw_semaphore commit_sem;
 	int cmt_state;
@@ -1903,6 +1907,7 @@ struct ubifs_znode *ubifs_tnc_postorder_next(const struct ubifs_info *c,
					     struct ubifs_znode *znode);
 long ubifs_destroy_tnc_subtree(const struct ubifs_info *c,
			       struct ubifs_znode *zr);
+void ubifs_destroy_tnc_tree(struct ubifs_info *c);
 struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
				     struct ubifs_zbranch *zbr,
				     struct ubifs_znode *parent, int iip);
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 8ad38c64708e..9bb2d24de709 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -81,8 +81,6 @@ xmbuf_alloc(
 	/* ensure all writes are below EOF to avoid pagecache zeroing */
 	i_size_write(inode, inode->i_sb->s_maxbytes);
 
-	trace_xmbuf_create(btp);
-
 	error = xfs_buf_cache_init(btp->bt_cache);
 	if (error)
 		goto out_file;
@@ -99,6 +97,8 @@ xmbuf_alloc(
 	if (error)
 		goto out_bcache;
 
+	trace_xmbuf_create(btp);
+
 	*btpp = btp;
 	return 0;
 
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 30d36596a2e4..c98cb468c357 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -811,6 +811,12 @@ restart:
 * caller should throw away the dquot and start over. Otherwise, the dquot
 * is returned locked (and held by the cache) as if there had been a cache
 * hit.
+ *
+ * The insert needs to be done under memalloc_nofs context because the radix
+ * tree can do memory allocation during insert. The qi->qi_tree_lock is taken
+ * in memory reclaim when freeing unused dquots, so we cannot have the radix
+ * tree node allocation recursing into filesystem reclaim whilst we hold the
+ * qi_tree_lock.
 */
 static int
 xfs_qm_dqget_cache_insert(
@@ -820,25 +826,27 @@ xfs_qm_dqget_cache_insert(
	xfs_dqid_t		id,
	struct xfs_dquot	*dqp)
 {
+	unsigned int		nofs_flags;
	int			error;
 
+	nofs_flags = memalloc_nofs_save();
 	mutex_lock(&qi->qi_tree_lock);
 	error = radix_tree_insert(tree, id, dqp);
 	if (unlikely(error)) {
 		/* Duplicate found! Caller must try again. */
-		mutex_unlock(&qi->qi_tree_lock);
 		trace_xfs_dqget_dup(dqp);
-		return error;
+		goto out_unlock;
 	}
 
 	/* Return a locked dquot to the caller, with a reference taken. */
 	xfs_dqlock(dqp);
 	dqp->q_nrefs = 1;
-
 	qi->qi_dquots++;
-	mutex_unlock(&qi->qi_tree_lock);
 
-	return 0;
+out_unlock:
+	mutex_unlock(&qi->qi_tree_lock);
+	memalloc_nofs_restore(nofs_flags);
+	return error;
 }
 
 /* Check our input parameters. */
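The xfs_dquot.c change above is a textbook application of the memalloc_nofs scope API: radix_tree_insert() may allocate tree nodes, and an allocation that enters filesystem reclaim could end up taking qi_tree_lock again, which the caller already holds. Scoping the whole critical section with memalloc_nofs_save()/restore() forces those allocations to behave as GFP_NOFS. A generic hedged sketch of the idiom (the lock and tree stand in for qi_tree_lock and the dquot radix tree):

#include <linux/radix-tree.h>
#include <linux/sched/mm.h>	/* memalloc_nofs_save()/restore() */

/* Sketch: allocating under a lock that reclaim can also take. */
static int insert_under_reclaim_lock(struct mutex *lock,
				     struct radix_tree_root *tree,
				     unsigned long id, void *item)
{
	unsigned int nofs_flags;
	int error;

	/* Everything below allocates as GFP_NOFS: no fs-reclaim recursion. */
	nofs_flags = memalloc_nofs_save();
	mutex_lock(lock);
	error = radix_tree_insert(tree, id, item);	/* may allocate nodes */
	mutex_unlock(lock);
	memalloc_nofs_restore(nofs_flags);

	return error;
}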
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 56b07d8ed431..aea97fc074f8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -4626,6 +4626,7 @@ TRACE_EVENT(xmbuf_create,
 		char		*path;
 		struct file	*file = btp->bt_file;
 
+		__entry->dev = btp->bt_mount->m_super->s_dev;
 		__entry->ino = file_inode(file)->i_ino;
 		memset(pathname, 0, sizeof(pathname));
 		path = file_path(file, pathname, sizeof(pathname) - 1);
@@ -4633,7 +4634,8 @@ TRACE_EVENT(xmbuf_create,
 			path = "(unknown)";
 		strncpy(__entry->pathname, path, sizeof(__entry->pathname));
 	),
-	TP_printk("xmino 0x%lx path '%s'",
+	TP_printk("dev %d:%d xmino 0x%lx path '%s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino, __entry->pathname)
 );
 
@@ -4642,6 +4644,7 @@ TRACE_EVENT(xmbuf_free,
 	TP_PROTO(struct xfs_buftarg *btp),
 	TP_ARGS(btp),
 	TP_STRUCT__entry(
+		__field(dev_t, dev)
 		__field(unsigned long, ino)
 		__field(unsigned long long, bytes)
 		__field(loff_t, size)
@@ -4650,11 +4653,13 @@ TRACE_EVENT(xmbuf_free,
 		struct file	*file = btp->bt_file;
 		struct inode	*inode = file_inode(file);
 
+		__entry->dev = btp->bt_mount->m_super->s_dev;
 		__entry->size = i_size_read(inode);
 		__entry->bytes = (inode->i_blocks << SECTOR_SHIFT) + inode->i_bytes;
 		__entry->ino = inode->i_ino;
 	),
-	TP_printk("xmino 0x%lx mem_bytes 0x%llx isize 0x%llx",
+	TP_printk("dev %d:%d xmino 0x%lx mem_bytes 0x%llx isize 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ino, __entry->bytes, __entry->size)
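The xfs_trace.h hunks follow the standard recipe for tagging a tracepoint with a device: store the raw dev_t in the entry at TP_fast_assign time (the buftarg may be gone by the time the ring buffer is read) and decode it with MAJOR()/MINOR() only in TP_printk, so the hot path stays a couple of plain stores. A minimal hedged sketch of that recipe on a made-up event:

/* Sketch: the usual dev_t-in-a-tracepoint pattern (hypothetical event). */
TRACE_EVENT(myfs_example,
	TP_PROTO(struct super_block *sb, unsigned long ino),
	TP_ARGS(sb, ino),
	TP_STRUCT__entry(
		__field(dev_t, dev)		/* raw dev_t, not a string */
		__field(unsigned long, ino)
	),
	TP_fast_assign(
		__entry->dev = sb->s_dev;	/* capture at event time */
		__entry->ino = ino;
	),
	/* Decode only at read-out; keeps the fast path cheap. */
	TP_printk("dev %d:%d ino 0x%lx",
		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino)
);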