From 114fc47492e23d93653e4a16664833e98d62a563 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 11 Jan 2012 17:41:01 -0800 Subject: ceph: change "ceph.layout" xattr to be "ceph.file.layout" The virtual extended attribute named "ceph.layout" is meaningful only for regular files. Change its name to be "ceph.file.layout" to more directly reflect that in the ceph xattr namespace. Preserve the old "ceph.layout" name for the time being (until we decide it's safe to get rid of it entirely). Add a missing initializer for "readonly" in the terminating entry. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- fs/ceph/xattr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index a5e36e4488a7..9e6734e38c12 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -111,8 +111,10 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_file_vxattrs[] = { + { true, "ceph.file.layout", ceph_vxattrcb_layout}, + /* The following extended attribute name is deprecated */ { true, "ceph.layout", ceph_vxattrcb_layout}, - { NULL, NULL } + { true, NULL, NULL } }; static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) -- cgit v1.2.3 From ab434b60ab07f8c44246b6fb0cddee436687a09a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 13 Jan 2012 22:22:03 -0800 Subject: ceph: initialize client debugfs outside of monc->mutex Initializing debufs under monc->mutex introduces a lock dependency for sb->s_type->i_mutex_key, which (combined with several other dependencies) leads to an annoying lockdep warning. There's no particular reason to do the debugfs setup under this lock, so move it out. It used to be the case that our first monmap could come from the OSD; that is no longer the case with recent servers, so we will reliably set up the client entry during the initial authentication. We don't have to worry about racing with debugfs teardown by ceph_debugfs_client_cleanup() because ceph_destroy_client() calls ceph_msgr_flush() first, which will wait for the message dispatch work to complete (and the debugfs init to complete). Fixes: #1940 Signed-off-by: Sage Weil --- net/ceph/ceph_common.c | 2 -- net/ceph/mon_client.c | 13 ++++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 97f70e50ad3b..761ad9d6cc3b 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -85,8 +85,6 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) } else { pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); memcpy(&client->fsid, fsid, sizeof(*fsid)); - ceph_debugfs_client_init(client); - client->have_fsid = true; } return 0; } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 0b62deae42bd..1845cde26227 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -8,8 +8,8 @@ #include #include +#include #include - #include /* @@ -340,8 +340,19 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, client->monc.monmap = monmap; kfree(old); + if (!client->have_fsid) { + client->have_fsid = true; + mutex_unlock(&monc->mutex); + /* + * do debugfs initialization without mutex to avoid + * creating a locking dependency + */ + ceph_debugfs_client_init(client); + goto out_unlocked; + } out: mutex_unlock(&monc->mutex); +out_unlocked: wake_up_all(&client->auth_wq); } -- cgit v1.2.3 From 32852a81bccd9e3d1953b894966393d1b546576d Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Sat, 14 Jan 2012 22:20:59 -0500 Subject: ceph: fix length validation in parse_reply_info() "len" is read from network and thus needs validation. Otherwise, given a bogus "len" value, p+len could be an out-of-bounds pointer, which is used in further parsing. Signed-off-by: Xi Wang Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6203d805eb45..be1415fcaac8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -262,6 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg, /* trace */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { + ceph_decode_need(&p, end, len, bad); err = parse_reply_info_trace(&p, p+len, info, features); if (err < 0) goto out_bad; @@ -270,6 +271,7 @@ static int parse_reply_info(struct ceph_msg *msg, /* extra */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { + ceph_decode_need(&p, end, len, bad); err = parse_reply_info_extra(&p, p+len, info, features); if (err < 0) goto out_bad; -- cgit v1.2.3 From d8fb02abdc39f92a1066313e2b17047876afa8f9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 12 Jan 2012 17:48:10 -0800 Subject: ceph: create a new session lock to avoid lock inversion Lockdep was reporting a possible circular lock dependency in dentry_lease_is_valid(). That function needs to sample the session's s_cap_gen and and s_cap_ttl fields coherently, but needs to do so while holding a dentry lock. The s_cap_lock field was being used to protect the two fields, but that can't be taken while holding a lock on a dentry within the session. In most cases, the s_cap_gen and s_cap_ttl fields only get operated on separately. But in three cases they need to be updated together. Implement a new lock to protect the spots updating both fields atomically is required. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- fs/ceph/caps.c | 4 ++-- fs/ceph/dir.c | 4 ++-- fs/ceph/mds_client.c | 8 +++++--- fs/ceph/mds_client.h | 7 +++++-- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8b53193e4f7c..90d789df9ce0 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -641,10 +641,10 @@ static int __cap_is_valid(struct ceph_cap *cap) unsigned long ttl; u32 gen; - spin_lock(&cap->session->s_cap_lock); + spin_lock(&cap->session->s_gen_ttl_lock); gen = cap->session->s_cap_gen; ttl = cap->session->s_cap_ttl; - spin_unlock(&cap->session->s_cap_lock); + spin_unlock(&cap->session->s_gen_ttl_lock); if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { dout("__cap_is_valid %p cap %p issued %s " diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 98954003a8d3..63c52f33361b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -975,10 +975,10 @@ static int dentry_lease_is_valid(struct dentry *dentry) di = ceph_dentry(dentry); if (di && di->lease_session) { s = di->lease_session; - spin_lock(&s->s_cap_lock); + spin_lock(&s->s_gen_ttl_lock); gen = s->s_cap_gen; ttl = s->s_cap_ttl; - spin_unlock(&s->s_cap_lock); + spin_unlock(&s->s_gen_ttl_lock); if (di->lease_gen == gen && time_before(jiffies, dentry->d_time) && diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index be1415fcaac8..a4fdf9397a90 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -400,9 +400,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; s->s_con.peer_name.num = cpu_to_le64(mds); - spin_lock_init(&s->s_cap_lock); + spin_lock_init(&s->s_gen_ttl_lock); s->s_cap_gen = 0; s->s_cap_ttl = 0; + + spin_lock_init(&s->s_cap_lock); s->s_renew_requested = 0; s->s_renew_seq = 0; INIT_LIST_HEAD(&s->s_caps); @@ -2328,10 +2330,10 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_STALE: pr_info("mds%d caps went stale, renewing\n", session->s_mds); - spin_lock(&session->s_cap_lock); + spin_lock(&session->s_gen_ttl_lock); session->s_cap_gen++; session->s_cap_ttl = 0; - spin_unlock(&session->s_cap_lock); + spin_unlock(&session->s_gen_ttl_lock); send_renew_caps(mdsc, session); break; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index a50ca0e39475..8c7c04ebb595 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -117,10 +117,13 @@ struct ceph_mds_session { void *s_authorizer_buf, *s_authorizer_reply_buf; size_t s_authorizer_buf_len, s_authorizer_reply_buf_len; - /* protected by s_cap_lock */ - spinlock_t s_cap_lock; + /* protected by s_gen_ttl_lock */ + spinlock_t s_gen_ttl_lock; u32 s_cap_gen; /* inc each time we get mds stale msg */ unsigned long s_cap_ttl; /* when session caps expire */ + + /* protected by s_cap_lock */ + spinlock_t s_cap_lock; struct list_head s_caps; /* all caps issued by this session */ int s_nr_caps, s_trim_caps; int s_num_cap_releases; -- cgit v1.2.3 From 97bb59a03dd6767fcc00be09b0c6d9e5294eeea6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 24 Jan 2012 10:08:36 -0600 Subject: rbd: fix a memory leak in rbd_get_client() If an existing rbd client is found to be suitable for use in rbd_get_client(), the rbd_options structure is not being freed as it should. Fix that. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 148ab944378d..7d8f8ddb3359 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -380,6 +380,7 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, rbdc = __rbd_client_find(opt); if (rbdc) { ceph_destroy_options(opt); + kfree(rbd_opts); /* using an existing client */ kref_get(&rbdc->kref); -- cgit v1.2.3 From d23a4b3fd6ef70b80411b39b8c8bc548a219ce70 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 29 Jan 2012 13:57:43 -0600 Subject: rbd: fix safety of rbd_put_client() The rbd_client structure uses a kref to arrange for cleaning up and freeing an instance when its last reference is dropped. The cleanup routine is rbd_client_release(), and one of the things it does is delete the rbd_client from rbd_client_list. It acquires node_lock to do so, but the way it is done is still not safe. The problem is that when attempting to reuse an existing rbd_client, the structure found might already be in the process of getting destroyed and cleaned up. Here's the scenario, with "CLIENT" representing an existing rbd_client that's involved in the race: Thread on CPU A | Thread on CPU B --------------- | --------------- rbd_put_client(CLIENT) | rbd_get_client() kref_put() | (acquires node_lock) kref->refcount becomes 0 | __rbd_client_find() returns CLIENT calls rbd_client_release() | kref_get(&CLIENT->kref); | (releases node_lock) (acquires node_lock) | deletes CLIENT from list | ...and starts using CLIENT... (releases node_lock) | and frees CLIENT | <-- but CLIENT gets freed here Fix this by having rbd_put_client() acquire node_lock. The result could still be improved, but at least it avoids this problem. Signed-off-by: Alex Elder Signed-off-by: Sage Weil --- drivers/block/rbd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7d8f8ddb3359..7f40cb4553c9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -407,15 +407,15 @@ done_err: /* * Destroy ceph client + * + * Caller must hold node_lock. */ static void rbd_client_release(struct kref *kref) { struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); dout("rbd_release_client %p\n", rbdc); - spin_lock(&node_lock); list_del(&rbdc->node); - spin_unlock(&node_lock); ceph_destroy_client(rbdc->client); kfree(rbdc->rbd_opts); @@ -428,7 +428,9 @@ static void rbd_client_release(struct kref *kref) */ static void rbd_put_client(struct rbd_device *rbd_dev) { + spin_lock(&node_lock); kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); + spin_unlock(&node_lock); rbd_dev->rbd_client = NULL; rbd_dev->client = NULL; } -- cgit v1.2.3