diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-03-24 18:32:48 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-03-24 18:32:48 -0700 |
commit | 85c7000fda0029ec16569b1eec8fd3a8d026be73 (patch) | |
tree | c6e7544175414fc6aa44f7f46f75242bf8c9b5c4 /fs/ceph/addr.c | |
parent | b1b07ba356f04268230e16a8e1813fe1b19dac54 (diff) | |
parent | f639d9867eea647005dc824e0e24f39ffc50d4e4 (diff) |
Merge tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:
"The highlights are:
- several changes to how snap context and snap realms are tracked
(Xiubo Li). In particular, this should resolve a long-standing
issue of high kworker CPU usage and various stalls caused by
needless iteration over all inodes in the snap realm.
- async create fixes to address hangs in some edge cases (Jeff
Layton)
- support for getvxattr MDS op for querying server-side xattrs, such
as file/directory layouts and ephemeral pins (Milind Changire)
- average latency is now maintained for all metrics (Venky Shankar)
- some tweaks around handling inline data to make it fit better with
netfs helper library (David Howells)
Also a couple of memory leaks got plugged along with a few assorted
fixups. Last but not least, Xiubo has stepped up to serve as a CephFS
co-maintainer"
* tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client: (27 commits)
ceph: fix memory leak in ceph_readdir when note_last_dentry returns error
ceph: uninitialized variable in debug output
ceph: use tracked average r/w/m latencies to display metrics in debugfs
ceph: include average/stdev r/w/m latency in mds metrics
ceph: track average r/w/m latency
ceph: use ktime_to_timespec64() rather than jiffies_to_timespec64()
ceph: assign the ci only when the inode isn't NULL
ceph: fix inode reference leakage in ceph_get_snapdir()
ceph: misc fix for code style and logs
ceph: allocate capsnap memory outside of ceph_queue_cap_snap()
ceph: do not release the global snaprealm until unmounting
ceph: remove incorrect and unused CEPH_INO_DOTDOT macro
MAINTAINERS: add Xiubo Li as cephfs co-maintainer
ceph: eliminate the recursion when rebuilding the snap context
ceph: do not update snapshot context when there is no new snapshot
ceph: zero the dir_entries memory when allocating it
ceph: move to a dedicated slabcache for ceph_cap_snap
ceph: add getvxattr op
libceph: drop else branches in prepare_read_data{,_cont}
ceph: fix comments mentioning i_mutex
...
Diffstat (limited to 'fs/ceph/addr.c')
-rw-r--r-- | fs/ceph/addr.c | 240 |
1 files changed, 112 insertions, 128 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index f6135c93ce9d..c7a0ab0d298b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -184,7 +184,7 @@ static int ceph_releasepage(struct page *page, gfp_t gfp) static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) { - struct inode *inode = rreq->mapping->host; + struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_layout *lo = &ci->i_layout; u32 blockoff; @@ -201,7 +201,7 @@ static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq) { - struct inode *inode = subreq->rreq->mapping->host; + struct inode *inode = subreq->rreq->inode; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 objno, objoff; @@ -244,10 +244,63 @@ static void finish_netfs_read(struct ceph_osd_request *req) iput(req->r_inode); } +static bool ceph_netfs_issue_op_inline(struct netfs_read_subrequest *subreq) +{ + struct netfs_read_request *rreq = subreq->rreq; + struct inode *inode = rreq->inode; + struct ceph_mds_reply_info_parsed *rinfo; + struct ceph_mds_reply_info_in *iinfo; + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_inode_info *ci = ceph_inode(inode); + struct iov_iter iter; + ssize_t err = 0; + size_t len; + + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + __clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); + + if (subreq->start >= inode->i_size) + goto out; + + /* We need to fetch the inline data. */ + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_ino1 = ci->i_vino; + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA); + req->r_num_caps = 2; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) + goto out; + + rinfo = &req->r_reply_info; + iinfo = &rinfo->targeti; + if (iinfo->inline_version == CEPH_INLINE_NONE) { + /* The data got uninlined */ + ceph_mdsc_put_request(req); + return false; + } + + len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); + if (err == 0) + err = -EFAULT; + + ceph_mdsc_put_request(req); +out: + netfs_subreq_terminated(subreq, err, false); + return true; +} + static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) { struct netfs_read_request *rreq = subreq->rreq; - struct inode *inode = rreq->mapping->host; + struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; @@ -258,6 +311,10 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) int err = 0; u64 len = subreq->len; + if (ci->i_inline_version != CEPH_INLINE_NONE && + ceph_netfs_issue_op_inline(subreq)) + return; + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, 0, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, @@ -326,23 +383,9 @@ static int ceph_readpage(struct file *file, struct page *subpage) size_t len = folio_size(folio); u64 off = folio_file_pos(folio); - if (ci->i_inline_version != CEPH_INLINE_NONE) { - /* - * Uptodate inline data should have been added - * into page cache while getting Fcr caps. - */ - if (off == 0) { - folio_unlock(folio); - return -EINVAL; - } - zero_user_segment(&folio->page, 0, folio_size(folio)); - folio_mark_uptodate(folio); - folio_unlock(folio); - return 0; - } - - dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n", - vino.ino, vino.snap, file, off, len, folio, folio_index(folio)); + dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n inline %d", + vino.ino, vino.snap, file, off, len, folio, folio_index(folio), + ci->i_inline_version != CEPH_INLINE_NONE); return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL); } @@ -1281,45 +1324,11 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); struct folio *folio = NULL; - pgoff_t index = pos >> PAGE_SHIFT; int r; - /* - * Uninlining should have already been done and everything updated, EXCEPT - * for inline_version sent to the MDS. - */ - if (ci->i_inline_version != CEPH_INLINE_NONE) { - unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - if (aop_flags & AOP_FLAG_NOFS) - fgp_flags |= FGP_NOFS; - folio = __filemap_get_folio(mapping, index, fgp_flags, - mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; - - /* - * The inline_version on a new inode is set to 1. If that's the - * case, then the folio is brand new and isn't yet Uptodate. - */ - r = 0; - if (index == 0 && ci->i_inline_version != 1) { - if (!folio_test_uptodate(folio)) { - WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n", - ci->i_inline_version); - r = -EINVAL; - } - goto out; - } - zero_user_segment(&folio->page, 0, folio_size(folio)); - folio_mark_uptodate(folio); - goto out; - } - r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL, &ceph_netfs_read_ops, NULL); -out: if (r == 0) folio_wait_fscache(folio); if (r < 0) { @@ -1515,19 +1524,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); - if (ci->i_inline_version != CEPH_INLINE_NONE) { - struct page *locked_page = NULL; - if (off == 0) { - lock_page(page); - locked_page = page; - } - err = ceph_uninline_data(vma->vm_file, locked_page); - if (locked_page) - unlock_page(locked_page); - if (err < 0) - goto out_free; - } - if (off + thp_size(page) <= size) len = thp_size(page); else @@ -1584,11 +1580,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ceph_put_snap_context(snapc); } while (err == 0); - if (ret == VM_FAULT_LOCKED || - ci->i_inline_version != CEPH_INLINE_NONE) { + if (ret == VM_FAULT_LOCKED) { int dirty; spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -1652,16 +1646,30 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, } } -int ceph_uninline_data(struct file *filp, struct page *locked_page) +int ceph_uninline_data(struct file *file) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; - struct page *page = NULL; - u64 len, inline_version; + struct ceph_cap_flush *prealloc_cf; + struct folio *folio = NULL; + u64 inline_version = CEPH_INLINE_NONE; + struct page *pages[1]; int err = 0; - bool from_pagecache = false; + u64 len; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + folio = read_mapping_folio(inode->i_mapping, 0, file); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto out; + } + + folio_lock(folio); spin_lock(&ci->i_ceph_lock); inline_version = ci->i_inline_version; @@ -1672,45 +1680,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) if (inline_version == 1 || /* initial version, no data */ inline_version == CEPH_INLINE_NONE) - goto out; - - if (locked_page) { - page = locked_page; - WARN_ON(!PageUptodate(page)); - } else if (ceph_caps_issued(ci) & - (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { - page = find_get_page(inode->i_mapping, 0); - if (page) { - if (PageUptodate(page)) { - from_pagecache = true; - lock_page(page); - } else { - put_page(page); - page = NULL; - } - } - } + goto out_unlock; - if (page) { - len = i_size_read(inode); - if (len > PAGE_SIZE) - len = PAGE_SIZE; - } else { - page = __page_cache_alloc(GFP_NOFS); - if (!page) { - err = -ENOMEM; - goto out; - } - err = __ceph_do_getattr(inode, page, - CEPH_STAT_CAP_INLINE_DATA, true); - if (err < 0) { - /* no inline data */ - if (err == -ENODATA) - err = 0; - goto out; - } - len = err; - } + len = i_size_read(inode); + if (len > folio_size(folio)) + len = folio_size(folio); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 0, 1, @@ -1718,7 +1692,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) NULL, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); - goto out; + goto out_unlock; } req->r_mtime = inode->i_mtime; @@ -1727,7 +1701,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_osdc_put_request(req); if (err < 0) - goto out; + goto out_unlock; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 1, 3, @@ -1736,10 +1710,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); - goto out; + goto out_unlock; } - osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); + pages[0] = folio_page(folio, 0); + osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false); { __le64 xattr_buf = cpu_to_le64(inline_version); @@ -1749,7 +1724,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) CEPH_OSD_CMPXATTR_OP_GT, CEPH_OSD_CMPXATTR_MODE_U64); if (err) - goto out_put; + goto out_put_req; } { @@ -1760,7 +1735,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) "inline_version", xattr_buf, xattr_len, 0, 0); if (err) - goto out_put; + goto out_put_req; } req->r_mtime = inode->i_mtime; @@ -1771,19 +1746,28 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); -out_put: + if (!err) { + int dirty; + + /* Set to CAP_INLINE_NONE and dirty the caps */ + down_read(&fsc->mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + up_read(&fsc->mdsc->snap_rwsem); + if (dirty) + __mark_inode_dirty(inode, dirty); + } +out_put_req: ceph_osdc_put_request(req); if (err == -ECANCELED) err = 0; +out_unlock: + folio_unlock(folio); + folio_put(folio); out: - if (page && page != locked_page) { - if (from_pagecache) { - unlock_page(page); - put_page(page); - } else - __free_pages(page, 0); - } - + ceph_free_cap_flush(prealloc_cf); dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", inode, ceph_vinop(inode), inline_version, err); return err; |