diff options
Diffstat (limited to 'fs')
52 files changed, 515 insertions, 317 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 9774588da60e..bc821a86d965 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -58,6 +58,13 @@ config FS_DAX_PMD depends on ZONE_DEVICE depends on TRANSPARENT_HUGEPAGE +# Selected by DAX drivers that do not expect filesystem DAX to support +# get_user_pages() of DAX mappings. I.e. "limited" indicates no support +# for fork() of processes with MAP_SHARED mappings or support for +# direct-I/O to a DAX mapping. +config FS_DAX_LIMITED + bool + endif # BLOCK # Posix ACL utility routines diff --git a/fs/affs/dir.c b/fs/affs/dir.c index d180b46453cf..b2bf7016e1b3 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -81,7 +81,7 @@ affs_readdir(struct file *file, struct dir_context *ctx) * we can jump directly to where we left off. */ ino = (u32)(long)file->private_data; - if (ino && inode_cmp_iversion(inode, file->f_version) == 0) { + if (ino && inode_eq_iversion(inode, file->f_version)) { pr_debug("readdir() left off=%d\n", ino); goto inside; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 83732fef510d..bdb201230bae 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1599,6 +1599,8 @@ static int fill_files_note(struct memelfnote *note) /* *Estimated* file count and total data size needed */ count = current->mm->map_count; + if (count > UINT_MAX / 64) + return -EINVAL; size = count * 64; names_ofs = (2 + 3 * count) * sizeof(data[0]); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index dbf07051aacd..b4336b42ce3b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -299,7 +299,8 @@ unlock: * start an async read(ahead) operation. return nr_pages we submitted * a read for on success, or negative error code. */ -static int start_read(struct inode *inode, struct list_head *page_list, int max) +static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, + struct list_head *page_list, int max) { struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; @@ -316,7 +317,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) int got = 0; int ret = 0; - if (!current->journal_info) { + if (!rw_ctx) { /* caller of readpages does not hold buffer and read caps * (fadvise, madvise and readahead cases) */ int want = CEPH_CAP_FILE_CACHE; @@ -437,6 +438,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, { struct inode *inode = file_inode(file); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_file_info *ci = file->private_data; + struct ceph_rw_context *rw_ctx; int rc = 0; int max = 0; @@ -449,11 +452,12 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, if (rc == 0) goto out; + rw_ctx = ceph_find_rw_context(ci); max = fsc->mount_options->rsize >> PAGE_SHIFT; - dout("readpages %p file %p nr_pages %d max %d\n", - inode, file, nr_pages, max); + dout("readpages %p file %p ctx %p nr_pages %d max %d\n", + inode, file, rw_ctx, nr_pages, max); while (!list_empty(page_list)) { - rc = start_read(inode, page_list, max); + rc = start_read(inode, rw_ctx, page_list, max); if (rc < 0) goto out; } @@ -574,7 +578,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_fs_client *fsc; struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); - long writeback_stat; int err, len = PAGE_SIZE; struct ceph_writeback_ctl ceph_wbc; @@ -615,8 +618,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n", inode, page, page->index, page_off, len, snapc, snapc->seq); - writeback_stat = atomic_long_inc_return(&fsc->writeback_count); - if (writeback_stat > + if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); @@ -651,6 +653,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) end_page_writeback(page); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); /* page's reference */ + + if (atomic_long_dec_return(&fsc->writeback_count) < + CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) + clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + return err; } @@ -1450,9 +1457,10 @@ static int ceph_filemap_fault(struct vm_fault *vmf) if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || ci->i_inline_version == CEPH_INLINE_NONE) { - current->journal_info = vma->vm_file; + CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); + ceph_add_rw_context(fi, &rw_ctx); ret = filemap_fault(vmf); - current->journal_info = NULL; + ceph_del_rw_context(fi, &rw_ctx); } else ret = -EAGAIN; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a14b2c974c9e..6582c4507e6c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -154,13 +154,19 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) spin_unlock(&mdsc->caps_list_lock); } -void ceph_reserve_caps(struct ceph_mds_client *mdsc, +/* + * Called under mdsc->mutex. + */ +int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need) { - int i; + int i, j; struct ceph_cap *cap; int have; int alloc = 0; + int max_caps; + bool trimmed = false; + struct ceph_mds_session *s; LIST_HEAD(newcaps); dout("reserve caps ctx=%p need=%d\n", ctx, need); @@ -179,16 +185,37 @@ void ceph_reserve_caps(struct ceph_mds_client *mdsc, spin_unlock(&mdsc->caps_list_lock); for (i = have; i < need; i++) { +retry: cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); - if (!cap) - break; + if (!cap) { + if (!trimmed) { + for (j = 0; j < mdsc->max_sessions; j++) { + s = __ceph_lookup_mds_session(mdsc, j); + if (!s) + continue; + mutex_unlock(&mdsc->mutex); + + mutex_lock(&s->s_mutex); + max_caps = s->s_nr_caps - (need - i); + ceph_trim_caps(mdsc, s, max_caps); + mutex_unlock(&s->s_mutex); + + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + trimmed = true; + goto retry; + } else { + pr_warn("reserve caps ctx=%p ENOMEM " + "need=%d got=%d\n", + ctx, need, have + alloc); + goto out_nomem; + } + } list_add(&cap->caps_item, &newcaps); alloc++; } - /* we didn't manage to reserve as much as we needed */ - if (have + alloc != need) - pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", - ctx, need, have + alloc); + BUG_ON(have + alloc != need); spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; @@ -204,6 +231,24 @@ void ceph_reserve_caps(struct ceph_mds_client *mdsc, dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", ctx, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); + return 0; + +out_nomem: + while (!list_empty(&newcaps)) { + cap = list_first_entry(&newcaps, + struct ceph_cap, caps_item); + list_del(&cap->caps_item); + kmem_cache_free(ceph_cap_cachep, cap); + } + + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_avail_count += have; + mdsc->caps_reserve_count -= have; + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); + return -ENOMEM; } int ceph_unreserve_caps(struct ceph_mds_client *mdsc, @@ -498,7 +543,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, */ if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { if (issued & CEPH_CAP_FILE_SHARED) - ci->i_shared_gen++; + atomic_inc(&ci->i_shared_gen); if (S_ISDIR(ci->vfs_inode.i_mode)) { dout(" marking %p NOT complete\n", &ci->vfs_inode); __ceph_dir_clear_complete(ci); @@ -577,18 +622,30 @@ void ceph_add_cap(struct inode *inode, } } - if (!ci->i_snap_realm) { + if (!ci->i_snap_realm || + ((flags & CEPH_CAP_FLAG_AUTH) && + realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) { /* * add this inode to the appropriate snap realm */ struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, realmino); if (realm) { + struct ceph_snap_realm *oldrealm = ci->i_snap_realm; + if (oldrealm) { + spin_lock(&oldrealm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + spin_unlock(&oldrealm->inodes_with_caps_lock); + } + spin_lock(&realm->inodes_with_caps_lock); ci->i_snap_realm = realm; list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps); spin_unlock(&realm->inodes_with_caps_lock); + + if (oldrealm) + ceph_put_snap_realm(mdsc, oldrealm); } else { pr_err("ceph_add_cap: couldn't find snap realm %llx\n", realmino); @@ -890,6 +947,11 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) /* * called under i_ceph_lock */ +static int __ceph_is_single_caps(struct ceph_inode_info *ci) +{ + return rb_first(&ci->i_caps) == rb_last(&ci->i_caps); +} + static int __ceph_is_any_caps(struct ceph_inode_info *ci) { return !RB_EMPTY_ROOT(&ci->i_caps); @@ -1703,21 +1765,24 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, int mds = -1; /* keep track of how far we've gone through i_caps list to avoid an infinite loop on retry */ struct rb_node *p; - int delayed = 0, sent = 0, num; - bool is_delayed = flags & CHECK_CAPS_NODELAY; + int delayed = 0, sent = 0; + bool no_delay = flags & CHECK_CAPS_NODELAY; bool queue_invalidate = false; - bool force_requeue = false; bool tried_invalidate = false; /* if we are unmounting, flush any unused caps immediately. */ if (mdsc->stopping) - is_delayed = true; + no_delay = true; spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; + if (!(flags & CHECK_CAPS_AUTHONLY) || + (ci->i_auth_cap && __ceph_is_single_caps(ci))) + __cap_delay_cancel(mdsc, ci); + goto retry_locked; retry: spin_lock(&ci->i_ceph_lock); @@ -1772,7 +1837,7 @@ retry_locked: * have cached pages, but don't want them, then try to invalidate. * If we fail, it's because pages are locked.... try again later. */ - if ((!is_delayed || mdsc->stopping) && + if ((!no_delay || mdsc->stopping) && !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ inode->i_data.nrpages && /* have cached pages */ @@ -1781,27 +1846,16 @@ retry_locked: !tried_invalidate) { dout("check_caps trying to invalidate on %p\n", inode); if (try_nonblocking_invalidate(inode) < 0) { - if (revoking & (CEPH_CAP_FILE_CACHE| - CEPH_CAP_FILE_LAZYIO)) { - dout("check_caps queuing invalidate\n"); - queue_invalidate = true; - ci->i_rdcache_revoking = ci->i_rdcache_gen; - } else { - dout("check_caps failed to invalidate pages\n"); - /* we failed to invalidate pages. check these - caps again later. */ - force_requeue = true; - __cap_set_timeouts(mdsc, ci); - } + dout("check_caps queuing invalidate\n"); + queue_invalidate = true; + ci->i_rdcache_revoking = ci->i_rdcache_gen; } tried_invalidate = true; goto retry_locked; } - num = 0; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - num++; /* avoid looping forever */ if (mds >= cap->mds || @@ -1864,7 +1918,7 @@ retry_locked: cap->mds_wanted == want) continue; /* nope, all good */ - if (is_delayed) + if (no_delay) goto ack; /* delay? */ @@ -1955,15 +2009,8 @@ ack: goto retry; /* retake i_ceph_lock and restart our cap scan. */ } - /* - * Reschedule delayed caps release if we delayed anything, - * otherwise cancel. - */ - if (delayed && is_delayed) - force_requeue = true; /* __send_cap delayed release; requeue */ - if (!delayed && !is_delayed) - __cap_delay_cancel(mdsc, ci); - else if (!is_delayed || force_requeue) + /* Reschedule delayed caps release if we delayed anything */ + if (delayed) __cap_delay_requeue(mdsc, ci); spin_unlock(&ci->i_ceph_lock); @@ -2160,7 +2207,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) u64 flush_tid; int err = 0; int dirty; - int wait = wbc->sync_mode == WB_SYNC_ALL; + int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync); dout("write_inode %p wait=%d\n", inode, wait); if (wait) { @@ -3426,7 +3473,14 @@ retry: */ issued = cap->issued; - WARN_ON(issued != cap->implemented); + if (issued != cap->implemented) + pr_err_ratelimited("handle_cap_export: issued != implemented: " + "ino (%llx.%llx) mds%d seq %d mseq %d " + "issued %s implemented %s\n", + ceph_vinop(inode), mds, cap->seq, cap->mseq, + ceph_cap_string(issued), + ceph_cap_string(cap->implemented)); + tcap = __get_cap_for_mds(ci, target); if (tcap) { @@ -3572,12 +3626,13 @@ retry: if ((ph->flags & CEPH_CAP_FLAG_AUTH) && (ocap->seq != le32_to_cpu(ph->seq) || ocap->mseq != le32_to_cpu(ph->mseq))) { - pr_err("handle_cap_import: mismatched seq/mseq: " - "ino (%llx.%llx) mds%d seq %d mseq %d " - "importer mds%d has peer seq %d mseq %d\n", - ceph_vinop(inode), peer, ocap->seq, - ocap->mseq, mds, le32_to_cpu(ph->seq), - le32_to_cpu(ph->mseq)); + pr_err_ratelimited("handle_cap_import: " + "mismatched seq/mseq: ino (%llx.%llx) " + "mds%d seq %d mseq %d importer mds%d " + "has peer seq %d mseq %d\n", + ceph_vinop(inode), peer, ocap->seq, + ocap->mseq, mds, le32_to_cpu(ph->seq), + le32_to_cpu(ph->mseq)); } __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } @@ -3939,11 +3994,20 @@ int ceph_encode_inode_release(void **p, struct inode *inode, cap = __get_cap_for_mds(ci, mds); if (cap && __cap_is_valid(cap)) { - if (force || - ((cap->issued & drop) && - (cap->issued & unless) == 0)) { - if ((cap->issued & drop) && - (cap->issued & unless) == 0) { + unless &= cap->issued; + if (unless) { + if (unless & CEPH_CAP_AUTH_EXCL) + drop &= ~CEPH_CAP_AUTH_SHARED; + if (unless & CEPH_CAP_LINK_EXCL) + drop &= ~CEPH_CAP_LINK_SHARED; + if (unless & CEPH_CAP_XATTR_EXCL) + drop &= ~CEPH_CAP_XATTR_SHARED; + if (unless & CEPH_CAP_FILE_EXCL) + drop &= ~CEPH_CAP_FILE_SHARED; + } + + if (force || (cap->issued & drop)) { + if (cap->issued & drop) { int wanted = __ceph_caps_wanted(ci); if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) wanted |= cap->mds_wanted; @@ -3975,7 +4039,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, *p += sizeof(*rel); ret = 1; } else { - dout("encode_inode_release %p cap %p %s\n", + dout("encode_inode_release %p cap %p %s (noop)\n", inode, cap, ceph_cap_string(cap->issued)); } } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8a5266699b67..0c4346806e17 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -173,7 +173,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, * the MDS if/when the directory is modified). */ static int __dcache_readdir(struct file *file, struct dir_context *ctx, - u32 shared_gen) + int shared_gen) { struct ceph_file_info *fi = file->private_data; struct dentry *parent = file->f_path.dentry; @@ -184,7 +184,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, u64 idx = 0; int err = 0; - dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos); + dout("__dcache_readdir %p v%u at %llx\n", dir, (unsigned)shared_gen, ctx->pos); /* search start position */ if (ctx->pos > 2) { @@ -231,11 +231,17 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, goto out; } - di = ceph_dentry(dentry); spin_lock(&dentry->d_lock); - if (di->lease_shared_gen == shared_gen && - d_really_is_positive(dentry) && - fpos_cmp(ctx->pos, di->offset) <= 0) { + di = ceph_dentry(dentry); + if (d_unhashed(dentry) || + d_really_is_negative(dentry) || + di->lease_shared_gen != shared_gen) { + spin_unlock(&dentry->d_lock); + dput(dentry); + err = -EAGAIN; + goto out; + } + if (fpos_cmp(ctx->pos, di->offset) <= 0) { emit_dentry = true; } spin_unlock(&dentry->d_lock); @@ -333,7 +339,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete_ordered(ci) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { - u32 shared_gen = ci->i_shared_gen; + int shared_gen = atomic_read(&ci->i_shared_gen); spin_unlock(&ci->i_ceph_lock); err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) @@ -381,6 +387,7 @@ more: if (op == CEPH_MDS_OP_READDIR) { req->r_direct_hash = ceph_frag_value(frag); __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + req->r_inode_drop = CEPH_CAP_FILE_EXCL; } if (fi->last_name) { req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); @@ -750,7 +757,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, spin_unlock(&ci->i_ceph_lock); dout(" dir %p complete, -ENOENT\n", dir); d_add(dentry, NULL); - di->lease_shared_gen = ci->i_shared_gen; + di->lease_shared_gen = atomic_read(&ci->i_shared_gen); return NULL; } spin_unlock(&ci->i_ceph_lock); @@ -835,7 +842,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mknod.mode = cpu_to_le32(mode); req->r_args.mknod.rdev = cpu_to_le32(rdev); - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; if (acls.pagelist) { req->r_pagelist = acls.pagelist; @@ -887,7 +894,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) @@ -936,7 +943,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) req->r_parent = dir; set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mkdir.mode = cpu_to_le32(mode); - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; if (acls.pagelist) { req->r_pagelist = acls.pagelist; @@ -983,7 +990,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_SHARED on source inode (mds will lock it) */ - req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; err = ceph_mdsc_do_request(mdsc, dir, req); if (err) { d_drop(dentry); @@ -1096,7 +1103,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_RDCACHE on source inode (mds will lock it) */ - req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; if (d_really_is_positive(new_dentry)) req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry)); err = ceph_mdsc_do_request(mdsc, old_dir, req); @@ -1106,16 +1113,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * do_request, above). If there is no trace, we need * to do it here. */ - - /* d_move screws up sibling dentries' offsets */ - ceph_dir_clear_complete(old_dir); - ceph_dir_clear_complete(new_dir); - d_move(old_dentry, new_dentry); - - /* ensure target dentry is invalidated, despite - rehashing bug in vfs_rename_dir */ - ceph_invalidate_dentry_lease(new_dentry); } ceph_mdsc_put_request(req); return err; @@ -1199,12 +1197,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) int valid = 0; spin_lock(&ci->i_ceph_lock); - if (ci->i_shared_gen == di->lease_shared_gen) + if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen) valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); spin_unlock(&ci->i_ceph_lock); dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", - dir, (unsigned)ci->i_shared_gen, dentry, - (unsigned)di->lease_shared_gen, valid); + dir, (unsigned)atomic_read(&ci->i_shared_gen), + dentry, (unsigned)di->lease_shared_gen, valid); return valid; } @@ -1332,24 +1330,37 @@ static void ceph_d_release(struct dentry *dentry) */ static void ceph_d_prune(struct dentry *dentry) { - dout("ceph_d_prune %p\n", dentry); + struct ceph_inode_info *dir_ci; + struct ceph_dentry_info *di; + + dout("ceph_d_prune %pd %p\n", dentry, dentry); /* do we have a valid parent? */ if (IS_ROOT(dentry)) return; - /* if we are not hashed, we don't affect dir's completeness */ - if (d_unhashed(dentry)) + /* we hold d_lock, so d_parent is stable */ + dir_ci = ceph_inode(d_inode(dentry->d_parent)); + if (dir_ci->i_vino.snap == CEPH_SNAPDIR) return; - if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR) + /* who calls d_delete() should also disable dcache readdir */ + if (d_really_is_negative(dentry)) return; - /* - * we hold d_lock, so d_parent is stable, and d_fsdata is never - * cleared until d_release - */ - ceph_dir_clear_complete(d_inode(dentry->d_parent)); + /* d_fsdata does not get cleared until d_release */ + if (!d_unhashed(dentry)) { + __ceph_dir_clear_complete(dir_ci); + return; + } + + /* Disable dcache readdir just in case that someone called d_drop() + * or d_invalidate(), but MDS didn't revoke CEPH_CAP_FILE_SHARED + * properly (dcache readdir is still enabled) */ + di = ceph_dentry(dentry); + if (di->offset > 0 && + di->lease_shared_gen == atomic_read(&dir_ci->i_shared_gen)) + __ceph_dir_clear_ordered(dir_ci); } /* diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5c17125f45c7..6639926eed4e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -181,6 +181,10 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) return -ENOMEM; } cf->fmode = fmode; + + spin_lock_init(&cf->rw_contexts_lock); + INIT_LIST_HEAD(&cf->rw_contexts); + cf->next_offset = 2; cf->readdir_cache_idx = -1; file->private_data = cf; @@ -396,7 +400,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, req->r_dentry = dget(dentry); req->r_num_caps = 2; if (flags & O_CREAT) { - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; if (acls.pagelist) { req->r_pagelist = acls.pagelist; @@ -464,6 +468,7 @@ int ceph_release(struct inode *inode, struct file *file) ceph_mdsc_put_request(cf->last_readdir); kfree(cf->last_name); kfree(cf->dir_info); + WARN_ON(!list_empty(&cf->rw_contexts)); kmem_cache_free(ceph_file_cachep, cf); /* wake up anyone waiting for caps on this inode */ @@ -1199,12 +1204,13 @@ again: retry_op = READ_INLINE; } } else { + CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - current->journal_info = filp; + ceph_add_rw_context(fi, &rw_ctx); ret = generic_file_read_iter(iocb, to); - current->journal_info = NULL; + ceph_del_rw_context(fi, &rw_ctx); } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ab81652198c4..c6ec5aa46100 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -494,7 +494,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_wrbuffer_ref = 0; ci->i_wrbuffer_ref_head = 0; atomic_set(&ci->i_filelock_ref, 0); - ci->i_shared_gen = 0; + atomic_set(&ci->i_shared_gen, 0); ci->i_rdcache_gen = 0; ci->i_rdcache_revoking = 0; @@ -1041,7 +1041,7 @@ static void update_dentry_lease(struct dentry *dentry, if (ceph_snap(dir) != CEPH_NOSNAP) goto out_unlock; - di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; + di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen); if (duration == 0) goto out_unlock; @@ -1080,6 +1080,27 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in) BUG_ON(d_inode(dn)); + if (S_ISDIR(in->i_mode)) { + /* If inode is directory, d_splice_alias() below will remove + * 'realdn' from its origin parent. We need to ensure that + * origin parent's readdir cache will not reference 'realdn' + */ + realdn = d_find_any_alias(in); + if (realdn) { + struct ceph_dentry_info *di = ceph_dentry(realdn); + spin_lock(&realdn->d_lock); + + realdn->d_op->d_prune(realdn); + + di->time = jiffies; + di->lease_shared_gen = 0; + di->offset = 0; + + spin_unlock(&realdn->d_lock); + dput(realdn); + } + } + /* dn must be unhashed */ if (!d_unhashed(dn)) d_drop(dn); @@ -1295,8 +1316,8 @@ retry_lookup: if (!rinfo->head->is_target) { dout("fill_trace null dentry\n"); if (d_really_is_positive(dn)) { - ceph_dir_clear_ordered(dir); dout("d_delete %p\n", dn); + ceph_dir_clear_ordered(dir); d_delete(dn); } else if (have_lease) { if (d_unhashed(dn)) @@ -1323,7 +1344,6 @@ retry_lookup: dout(" %p links to %p %llx.%llx, not %llx.%llx\n", dn, d_inode(dn), ceph_vinop(d_inode(dn)), ceph_vinop(in)); - ceph_dir_clear_ordered(dir); d_invalidate(dn); have_lease = false; } @@ -1573,9 +1593,19 @@ retry_lookup: } else if (d_really_is_positive(dn) && (ceph_ino(d_inode(dn)) != tvino.ino || ceph_snap(d_inode(dn)) != tvino.snap)) { + struct ceph_dentry_info *di = ceph_dentry(dn); dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); - __ceph_dir_clear_ordered(ci); + + spin_lock(&dn->d_lock); + if (di->offset > 0 && + di->lease_shared_gen == + atomic_read(&ci->i_shared_gen)) { + __ceph_dir_clear_ordered(ci); + di->offset = 0; + } + spin_unlock(&dn->d_lock); + d_delete(dn); dput(dn); goto retry_lookup; @@ -1600,9 +1630,7 @@ retry_lookup: &req->r_caps_reservation); if (ret < 0) { pr_err("fill_inode badness on %p\n", in); - if (d_really_is_positive(dn)) - __ceph_dir_clear_ordered(ci); - else + if (d_really_is_negative(dn)) iput(in); d_drop(dn); err = ret; @@ -2000,8 +2028,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) ceph_encode_timespec(&req->r_args.setattr.atime, &attr->ia_atime); mask |= CEPH_SETATTR_ATIME; - release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | - CEPH_CAP_FILE_WR; + release |= CEPH_CAP_FILE_SHARED | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } if (ia_valid & ATTR_MTIME) { @@ -2022,8 +2050,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) ceph_encode_timespec(&req->r_args.setattr.mtime, &attr->ia_mtime); mask |= CEPH_SETATTR_MTIME; - release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | - CEPH_CAP_FILE_WR; + release |= CEPH_CAP_FILE_SHARED | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } if (ia_valid & ATTR_SIZE) { @@ -2041,8 +2069,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) req->r_args.setattr.old_size = cpu_to_le64(inode->i_size); mask |= CEPH_SETATTR_SIZE; - release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | - CEPH_CAP_FILE_WR; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1b468250e947..2e8f90f96540 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -604,10 +604,20 @@ static void __register_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, struct inode *dir) { + int ret = 0; + req->r_tid = ++mdsc->last_tid; - if (req->r_num_caps) - ceph_reserve_caps(mdsc, &req->r_caps_reservation, - req->r_num_caps); + if (req->r_num_caps) { + ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation, + req->r_num_caps); + if (ret < 0) { + pr_err("__register_request %p " + "failed to reserve caps: %d\n", req, ret); + /* set req->r_err to fail early from __do_request */ + req->r_err = ret; + return; + } + } dout("__register_request %p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); insert_request(&mdsc->request_tree, req); @@ -1545,9 +1555,9 @@ out: /* * Trim session cap count down to some max number. */ -static int trim_caps(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - int max_caps) +int ceph_trim_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int max_caps) { int trim_caps = session->s_nr_caps - max_caps; @@ -2438,11 +2448,14 @@ out: */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) { - struct inode *inode = req->r_parent; + struct inode *dir = req->r_parent; + struct inode *old_dir = req->r_old_dentry_dir; - dout("invalidate_dir_request %p (complete, lease(s))\n", inode); + dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir); - ceph_dir_clear_complete(inode); + ceph_dir_clear_complete(dir); + if (old_dir) + ceph_dir_clear_complete(old_dir); if (req->r_dentry) ceph_invalidate_dentry_lease(req->r_dentry); if (req->r_old_dentry) @@ -2773,7 +2786,7 @@ static void handle_session(struct ceph_mds_session *session, break; case CEPH_SESSION_RECALL_STATE: - trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); + ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); break; case CEPH_SESSION_FLUSHMSG: diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 837ac4b087a0..71e3b783ee6f 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -444,4 +444,7 @@ ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); +extern int ceph_trim_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int max_caps); #endif diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 8a2ca41e4b97..07cf95e6413d 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -922,13 +922,17 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, /* * Move the inode to the new realm */ - spin_lock(&realm->inodes_with_caps_lock); + oldrealm = ci->i_snap_realm; + spin_lock(&oldrealm->inodes_with_caps_lock); list_del_init(&ci->i_snap_realm_item); + spin_unlock(&oldrealm->inodes_with_caps_lock); + + spin_lock(&realm->inodes_with_caps_lock); list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps); - oldrealm = ci->i_snap_realm; ci->i_snap_realm = realm; spin_unlock(&realm->inodes_with_caps_lock); + spin_unlock(&ci->i_ceph_lock); ceph_get_snap_realm(mdsc, realm); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 2beeec07fa76..21b2e5b004eb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -256,7 +256,8 @@ struct ceph_inode_xattr { */ struct ceph_dentry_info { struct ceph_mds_session *lease_session; - u32 lease_gen, lease_shared_gen; + int lease_shared_gen; + u32 lease_gen; u32 lease_seq; unsigned long lease_renew_after, lease_renew_from; struct list_head lru; @@ -353,7 +354,7 @@ struct ceph_inode_info { int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; atomic_t i_filelock_ref; - u32 i_shared_gen; /* increment each time we get FILE_SHARED */ + atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */ u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ @@ -648,7 +649,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); -extern void ceph_reserve_caps(struct ceph_mds_client *mdsc, +extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); @@ -668,6 +669,9 @@ struct ceph_file_info { short fmode; /* initialized on open */ short flags; /* CEPH_F_* */ + spinlock_t rw_contexts_lock; + struct list_head rw_contexts; + /* readdir: position within the dir */ u32 frag; struct ceph_mds_request *last_readdir; @@ -684,6 +688,49 @@ struct ceph_file_info { int dir_info_len; }; +struct ceph_rw_context { + struct list_head list; + struct task_struct *thread; + int caps; +}; + +#define CEPH_DEFINE_RW_CONTEXT(_name, _caps) \ + struct ceph_rw_context _name = { \ + .thread = current, \ + .caps = _caps, \ + } + +static inline void ceph_add_rw_context(struct ceph_file_info *cf, + struct ceph_rw_context *ctx) +{ + spin_lock(&cf->rw_contexts_lock); + list_add(&ctx->list, &cf->rw_contexts); + spin_unlock(&cf->rw_contexts_lock); +} + +static inline void ceph_del_rw_context(struct ceph_file_info *cf, + struct ceph_rw_context *ctx) +{ + spin_lock(&cf->rw_contexts_lock); + list_del(&ctx->list); + spin_unlock(&cf->rw_contexts_lock); +} + +static inline struct ceph_rw_context* +ceph_find_rw_context(struct ceph_file_info *cf) +{ + struct ceph_rw_context *ctx, *found = NULL; + spin_lock(&cf->rw_contexts_lock); + list_for_each_entry(ctx, &cf->rw_contexts, list) { + if (ctx->thread == current) { + found = ctx; + break; + } + } + spin_unlock(&cf->rw_contexts_lock); + return found; +} + struct ceph_readdir_cache_control { struct page *page; struct dentry **dentries; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 5fc5dc660600..ef80085ed564 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1218,23 +1218,11 @@ COMPATIBLE_IOCTL(DMX_SET_PES_FILTER) COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE) COMPATIBLE_IOCTL(DMX_GET_PES_PIDS) COMPATIBLE_IOCTL(DMX_GET_STC) -COMPATIBLE_IOCTL(FE_GET_INFO) -COMPATIBLE_IOCTL(FE_DISEQC_RESET_OVERLOAD) -COMPATIBLE_IOCTL(FE_DISEQC_SEND_MASTER_CMD) -COMPATIBLE_IOCTL(FE_DISEQC_RECV_SLAVE_REPLY) -COMPATIBLE_IOCTL(FE_DISEQC_SEND_BURST) -COMPATIBLE_IOCTL(FE_SET_TONE) -COMPATIBLE_IOCTL(FE_SET_VOLTAGE) -COMPATIBLE_IOCTL(FE_ENABLE_HIGH_LNB_VOLTAGE) -COMPATIBLE_IOCTL(FE_READ_STATUS) -COMPATIBLE_IOCTL(FE_READ_BER) -COMPATIBLE_IOCTL(FE_READ_SIGNAL_STRENGTH) -COMPATIBLE_IOCTL(FE_READ_SNR) -COMPATIBLE_IOCTL(FE_READ_UNCORRECTED_BLOCKS) -COMPATIBLE_IOCTL(FE_SET_FRONTEND) -COMPATIBLE_IOCTL(FE_GET_FRONTEND) -COMPATIBLE_IOCTL(FE_GET_EVENT) -COMPATIBLE_IOCTL(FE_DISHNETWORK_SEND_LEGACY_CMD) +COMPATIBLE_IOCTL(DMX_REQBUFS) +COMPATIBLE_IOCTL(DMX_QUERYBUF) +COMPATIBLE_IOCTL(DMX_EXPBUF) +COMPATIBLE_IOCTL(DMX_QBUF) +COMPATIBLE_IOCTL(DMX_DQBUF) COMPATIBLE_IOCTL(VIDEO_STOP) COMPATIBLE_IOCTL(VIDEO_PLAY) COMPATIBLE_IOCTL(VIDEO_FREEZE) diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index 58e2fe40b2a0..5933f995309a 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -33,8 +33,7 @@ config CRAMFS_BLOCKDEV config CRAMFS_MTD bool "Support CramFs image directly mapped in physical memory" - depends on CRAMFS && MTD - depends on CRAMFS=m || MTD=y + depends on CRAMFS && CRAMFS <= MTD default y if !CRAMFS_BLOCKDEV help This option allows the CramFs driver to load data directly from diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index c5a53fcc43ea..f0138674c1ed 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -242,7 +242,7 @@ exofs_readdir(struct file *file, struct dir_context *ctx) unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); - bool need_revalidate = inode_cmp_iversion(inode, file->f_version); + bool need_revalidate = !inode_eq_iversion(inode, file->f_version); if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) return 0; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 4111085a129f..3b8114def693 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -294,7 +294,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); unsigned char *types = NULL; - bool need_revalidate = inode_cmp_iversion(inode, file->f_version); + bool need_revalidate = !inode_eq_iversion(inode, file->f_version); if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) return 0; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index db5f9daa7780..7666c065b96f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -962,8 +962,11 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { err = bdev_dax_supported(sb, blocksize); - if (err) - goto failed_mount; + if (err) { + ext2_msg(sb, KERN_ERR, + "DAX unsupported by block device. Turning off DAX."); + sbi->s_mount_opt &= ~EXT2_MOUNT_DAX; + } } /* If the blocksize doesn't match, re-read the thing.. */ @@ -1228,7 +1231,7 @@ static void ext2_clear_super_error(struct super_block *sb) * write and hope for the best. */ ext2_msg(sb, KERN_ERR, - "previous I/O error to superblock detected\n"); + "previous I/O error to superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index afda0a0499ce..da87cf757f7d 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -209,7 +209,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (inode_cmp_iversion(inode, file->f_version)) { + if (!inode_eq_iversion(inode, file->f_version)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); @@ -569,7 +569,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) * cached entries. */ if ((!info->curr_node) || - inode_cmp_iversion(inode, file->f_version)) { + !inode_eq_iversion(inode, file->f_version)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); file->f_version = inode_query_iversion(inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 7c4165b88505..70cf4c7b268a 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1487,7 +1487,7 @@ int ext4_read_inline_dir(struct file *file, * dirent right now. Scan from the start of the inline * dir to make sure. */ - if (inode_cmp_iversion(inode, file->f_version)) { + if (!inode_eq_iversion(inode, file->f_version)) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 421222ec3509..39bf464c35f1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3712,11 +3712,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); - goto failed_mount; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; } err = bdev_dax_supported(sb, blocksize); - if (err) - goto failed_mount; + if (err) { + ext4_msg(sb, KERN_ERR, + "DAX unsupported by block device. Turning off DAX."); + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; + } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index cefea792cde8..2649759c478a 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -46,7 +46,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry) { int ret = 1; spin_lock(&dentry->d_lock); - if (inode_cmp_iversion(d_inode(dentry->d_parent), vfat_d_version(dentry))) + if (!inode_eq_iversion(d_inode(dentry->d_parent), vfat_d_version(dentry))) ret = 0; spin_unlock(&dentry->d_lock); return ret; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index e8120a282435..15e06fb552da 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -444,7 +444,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, int res = -ENOMEM; mutex_lock(&sbi->vh_mutex); - inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO); + inode = hfsplus_new_inode(dir->i_sb, dir, S_IFLNK | S_IRWXUGO); if (!inode) goto out; @@ -486,7 +486,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, int res = -ENOMEM; mutex_lock(&sbi->vh_mutex); - inode = hfsplus_new_inode(dir->i_sb, mode); + inode = hfsplus_new_inode(dir->i_sb, dir, mode); if (!inode) goto out; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a015044daa05..d9255abafb81 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -478,7 +478,8 @@ extern const struct address_space_operations hfsplus_aops; extern const struct address_space_operations hfsplus_btree_aops; extern const struct dentry_operations hfsplus_dentry_operations; -struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode); +struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, + umode_t mode); void hfsplus_delete_inode(struct inode *inode); void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 190c60efbc99..c0c8d433864f 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -354,7 +354,8 @@ static const struct file_operations hfsplus_file_operations = { .unlocked_ioctl = hfsplus_ioctl, }; -struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) +struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, + umode_t mode) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct inode *inode = new_inode(sb); @@ -364,9 +365,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) return NULL; inode->i_ino = sbi->next_cnid++; - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, dir, mode); set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 1d458b716957..513c357c734b 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -549,7 +549,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!sbi->hidden_dir) { mutex_lock(&sbi->vh_mutex); - sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); + sbi->hidden_dir = hfsplus_new_inode(sb, root, S_IFDIR); if (!sbi->hidden_dir) { mutex_unlock(&sbi->vh_mutex); err = -ENOMEM; diff --git a/fs/inode.c b/fs/inode.c index e2ca0f4b5151..ef362364d396 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -498,7 +498,6 @@ EXPORT_SYMBOL(__remove_inode_hash); void clear_inode(struct inode *inode) { - might_sleep(); /* * We have to cycle tree_lock here because reclaim can be still in the * process of removing the last page (in __delete_from_page_cache()) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ceeaf0fb6657..7d893543cf3b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1314,7 +1314,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) - && !inode_cmp_iversion_raw(inode, fattr->pre_change_attr)) { + && inode_eq_iversion_raw(inode, fattr->pre_change_attr)) { inode_set_iversion_raw(inode, fattr->change_attr); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); @@ -1373,7 +1373,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode_cmp_iversion_raw(inode, fattr->change_attr)) + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) @@ -1803,7 +1803,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { - if (inode_cmp_iversion_raw(inode, fattr->change_attr)) { + if (!inode_eq_iversion_raw(inode, fattr->change_attr)) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); /* Could it be a race with writeback? */ diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 6c5009cc4e6f..68cb9e4740b4 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -130,7 +130,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, } int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags, - time_t ctime, __u64 cno) + time64_t ctime, __u64 cno) { int err; diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index 7bbccc099709..10e16935fff6 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -46,7 +46,7 @@ struct nilfs_segsum_info { unsigned long nfileblk; u64 seg_seq; __u64 cno; - time_t ctime; + time64_t ctime; sector_t next; }; @@ -120,7 +120,7 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, struct nilfs_segment_buffer *prev); void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, struct the_nilfs *); -int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time_t, +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time64_t, __u64); int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 9f3ffba41533..0953635e7d48 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2040,7 +2040,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) goto out; /* Update time stamp */ - sci->sc_seg_ctime = get_seconds(); + sci->sc_seg_ctime = ktime_get_real_seconds(); err = nilfs_segctor_collect(sci, nilfs, mode); if (unlikely(err)) diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 84084a4d9b3e..04634e3e3d58 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -157,7 +157,7 @@ struct nilfs_sc_info { unsigned long sc_blk_cnt; unsigned long sc_datablk_cnt; unsigned long sc_nblk_this_inc; - time_t sc_seg_ctime; + time64_t sc_seg_ctime; __u64 sc_cno; unsigned long sc_flags; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1341a41e7b43..c7fa139d50e8 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -526,7 +526,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) * @modtime: modification time (option) */ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, - unsigned long nblocks, time_t modtime) + unsigned long nblocks, time64_t modtime) { struct buffer_head *bh; struct nilfs_segment_usage *su; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 158a9190c8ec..673a891350f4 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -35,7 +35,7 @@ int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end); int nilfs_sufile_alloc(struct inode *, __u64 *); int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, - unsigned long nblocks, time_t modtime); + unsigned long nblocks, time64_t modtime); int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned int, size_t); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3073b646e1ba..6ffeca84d7c3 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -283,10 +283,10 @@ int nilfs_commit_super(struct super_block *sb, int flag) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp = nilfs->ns_sbp; - time_t t; + time64_t t; /* nilfs->ns_sem must be locked by the caller. */ - t = get_seconds(); + t = ktime_get_real_seconds(); nilfs->ns_sbwtime = t; sbp[0]->s_wtime = cpu_to_le64(t); sbp[0]->s_sum = 0; diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 490303e3d517..4b25837e7724 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -31,7 +31,7 @@ static struct kset *nilfs_kset; #define NILFS_SHOW_TIME(time_t_val, buf) ({ \ struct tm res; \ int count = 0; \ - time_to_tm(time_t_val, 0, &res); \ + time64_to_tm(time_t_val, 0, &res); \ res.tm_year += 1900; \ res.tm_mon += 1; \ count = scnprintf(buf, PAGE_SIZE, \ @@ -579,7 +579,7 @@ nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t ctime; + time64_t ctime; down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; @@ -593,13 +593,13 @@ nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t ctime; + time64_t ctime; down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime); + return snprintf(buf, PAGE_SIZE, "%llu\n", ctime); } static ssize_t @@ -607,7 +607,7 @@ nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t nongc_ctime; + time64_t nongc_ctime; down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; @@ -621,14 +621,13 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t nongc_ctime; + time64_t nongc_ctime; down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)nongc_ctime); + return snprintf(buf, PAGE_SIZE, "%llu\n", nongc_ctime); } static ssize_t @@ -728,7 +727,7 @@ nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t sbwtime; + time64_t sbwtime; down_read(&nilfs->ns_sem); sbwtime = nilfs->ns_sbwtime; @@ -742,13 +741,13 @@ nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t sbwtime; + time64_t sbwtime; down_read(&nilfs->ns_sem); sbwtime = nilfs->ns_sbwtime; up_read(&nilfs->ns_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)sbwtime); + return snprintf(buf, PAGE_SIZE, "%llu\n", sbwtime); } static ssize_t diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 883d732b0259..36da1779f976 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -116,7 +116,7 @@ struct the_nilfs { */ struct buffer_head *ns_sbh[2]; struct nilfs_super_block *ns_sbp[2]; - time_t ns_sbwtime; + time64_t ns_sbwtime; unsigned int ns_sbwcount; unsigned int ns_sbsize; unsigned int ns_mount_state; @@ -131,8 +131,8 @@ struct the_nilfs { __u64 ns_nextnum; unsigned long ns_pseg_offset; __u64 ns_cno; - time_t ns_ctime; - time_t ns_nongc_ctime; + time64_t ns_ctime; + time64_t ns_nongc_ctime; atomic_t ns_ndirtyblks; /* @@ -267,7 +267,7 @@ struct nilfs_root { static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) { - u64 t = get_seconds(); + u64 t = ktime_get_real_seconds(); return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index b7520e20a770..977763d4c27d 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1776,7 +1776,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (inode_cmp_iversion(inode, *f_version)) { + if (!inode_eq_iversion(inode, *f_version)) { for (i = 0; i < i_size_read(inode) && i < offset; ) { de = (struct ocfs2_dir_entry *) (data->id_data + i); @@ -1870,7 +1870,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (inode_cmp_iversion(inode, *f_version)) { + if (!inode_eq_iversion(inode, *f_version)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ocfs2_dir_entry *) (bh->b_data + i); /* It's too expensive to do a full diff --git a/fs/pipe.c b/fs/pipe.c index a449ca0ec0c6..0913aed7fd0d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -35,11 +35,6 @@ */ unsigned int pipe_max_size = 1048576; -/* - * Minimum pipe size, as required by POSIX - */ -unsigned int pipe_min_size = PAGE_SIZE; - /* Maximum allocatable pages per user. Hard limit is unset by default, soft * matches default values. */ @@ -610,12 +605,21 @@ static unsigned long account_pipe_buffers(struct user_struct *user, static bool too_many_pipe_buffers_soft(unsigned long user_bufs) { - return pipe_user_pages_soft && user_bufs >= pipe_user_pages_soft; + unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft); + + return soft_limit && user_bufs > soft_limit; } static bool too_many_pipe_buffers_hard(unsigned long user_bufs) { - return pipe_user_pages_hard && user_bufs >= pipe_user_pages_hard; + unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard); + + return hard_limit && user_bufs > hard_limit; +} + +static bool is_unprivileged_user(void) +{ + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); } struct pipe_inode_info *alloc_pipe_info(void) @@ -624,22 +628,23 @@ struct pipe_inode_info *alloc_pipe_info(void) unsigned long pipe_bufs = PIPE_DEF_BUFFERS; struct user_struct *user = get_current_user(); unsigned long user_bufs; + unsigned int max_size = READ_ONCE(pipe_max_size); pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); if (pipe == NULL) goto out_free_uid; - if (pipe_bufs * PAGE_SIZE > pipe_max_size && !capable(CAP_SYS_RESOURCE)) - pipe_bufs = pipe_max_size >> PAGE_SHIFT; + if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE)) + pipe_bufs = max_size >> PAGE_SHIFT; user_bufs = account_pipe_buffers(user, 0, pipe_bufs); - if (too_many_pipe_buffers_soft(user_bufs)) { + if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) { user_bufs = account_pipe_buffers(user, pipe_bufs, 1); pipe_bufs = 1; } - if (too_many_pipe_buffers_hard(user_bufs)) + if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user()) goto out_revert_acct; pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), @@ -1020,18 +1025,16 @@ const struct file_operations pipefifo_fops = { * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ -unsigned int round_pipe_size(unsigned int size) +unsigned int round_pipe_size(unsigned long size) { - unsigned long nr_pages; - - if (size < pipe_min_size) - size = pipe_min_size; - - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages == 0) + if (size > (1U << 31)) return 0; - return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; + /* Minimum pipe size, as required by POSIX */ + if (size < PAGE_SIZE) + return PAGE_SIZE; + + return roundup_pow_of_two(size); } /* @@ -1046,8 +1049,6 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) long ret = 0; size = round_pipe_size(arg); - if (size == 0) - return -EINVAL; nr_pages = size >> PAGE_SHIFT; if (!nr_pages) @@ -1069,7 +1070,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) if (nr_pages > pipe->buffers && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && - !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + is_unprivileged_user()) { ret = -EPERM; goto out_revert_acct; } @@ -1125,16 +1126,6 @@ out_revert_acct: } /* - * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size - * will return an error. - */ -int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, - size_t *lenp, loff_t *ppos) -{ - return proc_dopipe_max_size(table, write, buf, lenp, ppos); -} - -/* * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same * location, so checking ->i_pipe is not enough to verify that this is a * pipe. diff --git a/fs/proc/array.c b/fs/proc/array.c index d67a72dcb92c..598803576e4c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -736,16 +736,10 @@ static int children_seq_open(struct inode *inode, struct file *file) return ret; } -int children_seq_release(struct inode *inode, struct file *file) -{ - seq_release(inode, file); - return 0; -} - const struct file_operations proc_tid_children_operations = { .open = children_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = children_seq_release, + .release = seq_release, }; #endif /* CONFIG_PROC_CHILDREN */ diff --git a/fs/proc/base.c b/fs/proc/base.c index 60316b52d659..9298324325ed 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -75,6 +75,7 @@ #include <linux/ptrace.h> #include <linux/tracehook.h> #include <linux/printk.h> +#include <linux/cache.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/audit.h> @@ -100,6 +101,8 @@ #include "internal.h" #include "fd.h" +#include "../../lib/kstrtox.h" + /* NOTE: * Implementing inode permission operations in /proc is almost * certainly an error. Permission checks need to happen during @@ -110,8 +113,8 @@ * in /proc for a task before it execs a suid executable. */ -static u8 nlink_tid; -static u8 nlink_tgid; +static u8 nlink_tid __ro_after_init; +static u8 nlink_tgid __ro_after_init; struct pid_entry { const char *name; @@ -1370,7 +1373,7 @@ static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; - WRITE_ONCE(task->fail_nth, n); + task->fail_nth = n; put_task_struct(task); return count; @@ -1386,8 +1389,7 @@ static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; - len = snprintf(numbuf, sizeof(numbuf), "%u\n", - READ_ONCE(task->fail_nth)); + len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); len = simple_read_from_buffer(buf, count, ppos, numbuf, len); put_task_struct(task); @@ -1907,8 +1909,33 @@ end_instantiate: static int dname_to_vma_addr(struct dentry *dentry, unsigned long *start, unsigned long *end) { - if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + const char *str = dentry->d_name.name; + unsigned long long sval, eval; + unsigned int len; + + len = _parse_integer(str, 16, &sval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (sval != (unsigned long)sval) return -EINVAL; + str += len; + + if (*str != '-') + return -EINVAL; + str++; + + len = _parse_integer(str, 16, &eval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (eval != (unsigned long)eval) + return -EINVAL; + str += len; + + if (*str != '\0') + return -EINVAL; + + *start = sval; + *end = eval; return 0; } @@ -2000,9 +2027,9 @@ out: } struct map_files_info { + unsigned long start; + unsigned long end; fmode_t mode; - unsigned int len; - unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; /* @@ -2172,10 +2199,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) if (++pos <= ctx->pos) continue; + info.start = vma->vm_start; + info.end = vma->vm_end; info.mode = vma->vm_file->f_mode; - info.len = snprintf(info.name, - sizeof(info.name), "%lx-%lx", - vma->vm_start, vma->vm_end); if (flex_array_put(fa, i++, &info, GFP_KERNEL)) BUG(); } @@ -2183,9 +2209,13 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) up_read(&mm->mmap_sem); for (i = 0; i < nr_files; i++) { + char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ + unsigned int len; + p = flex_array_get(fa, i); + len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, - p->name, p->len, + buf, len, proc_map_files_instantiate, task, (void *)(unsigned long)p->mode)) @@ -3018,11 +3048,11 @@ static const struct inode_operations proc_tgid_base_inode_operations = { static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) { struct dentry *dentry, *leader, *dir; - char buf[PROC_NUMBUF]; + char buf[10 + 1]; struct qstr name; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", pid); + name.len = snprintf(buf, sizeof(buf), "%u", pid); /* no ->d_hash() rejects on procfs */ dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { @@ -3034,7 +3064,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) return; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", tgid); + name.len = snprintf(buf, sizeof(buf), "%u", tgid); leader = d_hash_and_lookup(mnt->mnt_root, &name); if (!leader) goto out; @@ -3046,7 +3076,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) goto out_put_leader; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", pid); + name.len = snprintf(buf, sizeof(buf), "%u", pid); dentry = d_hash_and_lookup(dir, &name); if (dentry) { d_invalidate(dentry); @@ -3225,14 +3255,14 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { - char name[PROC_NUMBUF]; + char name[10 + 1]; int len; cond_resched(); if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE)) continue; - len = snprintf(name, sizeof(name), "%d", iter.tgid); + len = snprintf(name, sizeof(name), "%u", iter.tgid); ctx->pos = iter.tgid + TGID_OFFSET; if (!proc_fill_cache(file, ctx, name, len, proc_pid_instantiate, iter.task, NULL)) { @@ -3560,10 +3590,10 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { - char name[PROC_NUMBUF]; + char name[10 + 1]; int len; tid = task_pid_nr_ns(task, ns); - len = snprintf(name, sizeof(name), "%d", tid); + len = snprintf(name, sizeof(name), "%u", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index 290ba85cb900..a8ac48aebd59 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -55,8 +55,7 @@ static int show_console_dev(struct seq_file *m, void *v) if (dev) seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); - seq_printf(m, "\n"); - + seq_putc(m, '\n'); return 0; } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 96fc70225e54..6b80cd1e419a 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -236,7 +236,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, for (fd = ctx->pos - 2; fd < files_fdtable(files)->max_fds; fd++, ctx->pos++) { - char name[PROC_NUMBUF]; + char name[10 + 1]; int len; if (!fcheck_files(files, fd)) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 793a67574668..5d709fa8f3a2 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -28,7 +28,7 @@ static DEFINE_RWLOCK(proc_subdir_lock); -static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) +static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len) { if (len < de->namelen) return -1; @@ -60,7 +60,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, struct proc_dir_entry *de = rb_entry(node, struct proc_dir_entry, subdir_node); - int result = proc_match(len, name, de); + int result = proc_match(name, de, len); if (result < 0) node = node->rb_left; @@ -84,7 +84,7 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *this = rb_entry(*new, struct proc_dir_entry, subdir_node); - int result = proc_match(de->namelen, de->name, this); + int result = proc_match(de->name, this, de->namelen); parent = *new; if (result < 0) @@ -211,8 +211,8 @@ void proc_free_inum(unsigned int inum) * Don't create negative dentries here, return -ENOENT by hand * instead. */ -struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, - struct dentry *dentry) +struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, + struct proc_dir_entry *de) { struct inode *inode; @@ -235,7 +235,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - return proc_lookup_de(PDE(dir), dir, dentry); + return proc_lookup_de(dir, dentry, PDE(dir)); } /* @@ -247,8 +247,8 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, * value of the readdir() call, as long as it's non-negative * for success.. */ -int proc_readdir_de(struct proc_dir_entry *de, struct file *file, - struct dir_context *ctx) +int proc_readdir_de(struct file *file, struct dir_context *ctx, + struct proc_dir_entry *de) { int i; @@ -292,7 +292,7 @@ int proc_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); - return proc_readdir_de(PDE(inode), file, ctx); + return proc_readdir_de(file, ctx, PDE(inode)); } /* diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8dacaabb9f37..6e8724958116 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -5,6 +5,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ +#include <linux/cache.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/kernel.h> @@ -52,7 +53,7 @@ static void proc_evict_inode(struct inode *inode) } } -static struct kmem_cache * proc_inode_cachep; +static struct kmem_cache *proc_inode_cachep __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { @@ -128,12 +129,12 @@ enum {BIAS = -1U<<31}; static inline int use_pde(struct proc_dir_entry *pde) { - return atomic_inc_unless_negative(&pde->in_use); + return likely(atomic_inc_unless_negative(&pde->in_use)); } static void unuse_pde(struct proc_dir_entry *pde) { - if (atomic_dec_return(&pde->in_use) == BIAS) + if (unlikely(atomic_dec_return(&pde->in_use) == BIAS)) complete(pde->pde_unload_completion); } @@ -166,7 +167,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); /* After ->release. */ list_del(&pdeo->lh); - if (pdeo->c) + if (unlikely(pdeo->c)) complete(pdeo->c); kfree(pdeo); } @@ -420,7 +421,7 @@ static const char *proc_get_link(struct dentry *dentry, struct delayed_call *done) { struct proc_dir_entry *pde = PDE(inode); - if (unlikely(!use_pde(pde))) + if (!use_pde(pde)) return ERR_PTR(-EINVAL); set_delayed_call(done, proc_put_link, pde); return pde->data; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 4a67188c8d74..d697c8ab0a14 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -31,24 +31,28 @@ struct mempolicy; * subdir_node is used to build the rb tree "subdir" of the parent. */ struct proc_dir_entry { + /* + * number of callers into module in progress; + * negative -> it's going away RSN + */ + atomic_t in_use; + atomic_t count; /* use count */ + struct list_head pde_openers; /* who did ->open, but not ->release */ + /* protects ->pde_openers and all struct pde_opener instances */ + spinlock_t pde_unload_lock; + struct completion *pde_unload_completion; + const struct inode_operations *proc_iops; + const struct file_operations *proc_fops; + void *data; unsigned int low_ino; - umode_t mode; nlink_t nlink; kuid_t uid; kgid_t gid; loff_t size; - const struct inode_operations *proc_iops; - const struct file_operations *proc_fops; struct proc_dir_entry *parent; struct rb_root_cached subdir; struct rb_node subdir_node; - void *data; - atomic_t count; /* use count */ - atomic_t in_use; /* number of callers into module in progress; */ - /* negative -> it's going away RSN */ - struct completion *pde_unload_completion; - struct list_head pde_openers; /* who did ->open, but not ->release */ - spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ + umode_t mode; u8 namelen; char name[]; } __randomize_layout; @@ -149,10 +153,9 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i * generic.c */ extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); -extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, - struct dentry *); +struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file *, struct dir_context *); -extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *); +int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 4bc85cb8be6a..e8a93bc8285d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -512,23 +512,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) return -EFAULT; } else { if (kern_addr_valid(start)) { - unsigned long n; - /* * Using bounce buffer to bypass the * hardened user copy kernel text checks. */ - memcpy(buf, (char *) start, tsz); - n = copy_to_user(buffer, buf, tsz); - /* - * We cannot distinguish between fault on source - * and fault on destination. When this happens - * we clear too and hope it will trigger the - * EFAULT again. - */ - if (n) { - if (clear_user(buffer + tsz - n, - n)) + if (probe_kernel_read(buf, (void *) start, tsz)) { + if (clear_user(buffer, tsz)) + return -EFAULT; + } else { + if (copy_to_user(buffer, buf, tsz)) return -EFAULT; } } else { diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index a2bf369c923d..68c06ae7888c 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -135,7 +135,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, de = ERR_PTR(-ENOENT); net = get_proc_task_net(dir); if (net != NULL) { - de = proc_lookup_de(net->proc_net, dir, dentry); + de = proc_lookup_de(dir, dentry, net->proc_net); put_net(net); } return de; @@ -172,7 +172,7 @@ static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) ret = -EINVAL; net = get_proc_task_net(file_inode(file)); if (net != NULL) { - ret = proc_readdir_de(net->proc_net, file, ctx); + ret = proc_readdir_de(file, ctx, net->proc_net); put_net(net); } return ret; diff --git a/fs/proc/self.c b/fs/proc/self.c index 31326bb23b8b..4d7d061696b3 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> @@ -17,11 +18,11 @@ static const char *proc_self_get_link(struct dentry *dentry, if (!tgid) return ERR_PTR(-ENOENT); - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC); + /* max length of unsigned int in decimal + NULL term */ + name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); - sprintf(name, "%d", tgid); + sprintf(name, "%u", tgid); set_delayed_call(done, kfree_link, name); return name; } @@ -30,7 +31,7 @@ static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; -static unsigned self_inum; +static unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index b813e3b529f2..9d2efaca499f 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> @@ -18,11 +19,10 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, if (!pid) return ERR_PTR(-ENOENT); - name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, - dentry ? GFP_KERNEL : GFP_ATOMIC); + name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); - sprintf(name, "%d/task/%d", tgid, pid); + sprintf(name, "%u/task/%u", tgid, pid); set_delayed_call(done, kfree_link, name); return name; } @@ -31,7 +31,7 @@ static const struct inode_operations proc_thread_self_inode_operations = { .get_link = proc_thread_self_get_link, }; -static unsigned thread_self_inum; +static unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 885d445afa0d..a45f0af22a60 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1178,18 +1178,16 @@ fs_initcall(vmcore_init); /* Cleanup function for vmcore module. */ void vmcore_cleanup(void) { - struct list_head *pos, *next; - if (proc_vmcore) { proc_remove(proc_vmcore); proc_vmcore = NULL; } /* clear the vmcore list. */ - list_for_each_safe(pos, next, &vmcore_list) { + while (!list_empty(&vmcore_list)) { struct vmcore *m; - m = list_entry(pos, struct vmcore, list); + m = list_first_entry(&vmcore_list, struct vmcore, list); list_del(&m->list); kfree(m); } diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c index 14626b34d13e..0927a4b2ecaf 100644 --- a/fs/udf/udftime.c +++ b/fs/udf/udftime.c @@ -62,6 +62,11 @@ udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src) dest->tv_sec -= offset * 60; dest->tv_nsec = 1000 * (src.centiseconds * 10000 + src.hundredsOfMicroseconds * 100 + src.microseconds); + /* + * Sanitize nanosecond field since reportedly some filesystems are + * recorded with bogus sub-second values. + */ + dest->tv_nsec %= NSEC_PER_SEC; return dest; } diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 50dfce000864..b721d0bda5e5 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -429,7 +429,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx) unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); - bool need_revalidate = inode_cmp_iversion(inode, file->f_version); + bool need_revalidate = !inode_eq_iversion(inode, file->f_version); unsigned flags = UFS_SB(sb)->s_flags; UFSD("BEGIN\n"); |