From 2794a82a11cfeae0890741b18b0049ddb55ce646 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: separate osd request data info Pull the fields in an osd request structure that define the data for the request out into a separate structure. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b7b7a88d9f68..0e814dfda48e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1425,12 +1425,12 @@ static struct ceph_osd_request *rbd_osd_req_create( break; /* Nothing to do */ case OBJ_REQUEST_BIO: rbd_assert(obj_request->bio_list != NULL); - osd_req->r_bio = obj_request->bio_list; + osd_req->r_data.bio = obj_request->bio_list; break; case OBJ_REQUEST_PAGES: - osd_req->r_pages = obj_request->pages; - osd_req->r_num_pages = obj_request->page_count; - osd_req->r_page_alignment = offset & ~PAGE_MASK; + osd_req->r_data.pages = obj_request->pages; + osd_req->r_data.num_pages = obj_request->page_count; + osd_req->r_data.alignment = offset & ~PAGE_MASK; break; } -- cgit v1.2.3 From 2ac2b7a6d4976bd6b5dc0751aa77d12d48d3ac4c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: distinguish page and bio requests An osd request uses either pages or a bio list for its data. Use a union to record information about the two, and add a data type tag to select between them. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 +++ fs/ceph/addr.c | 4 +++ fs/ceph/file.c | 1 + include/linux/ceph/osd_client.h | 11 +++++++- net/ceph/osd_client.c | 56 ++++++++++++++++++++++++++--------------- 5 files changed, 55 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0e814dfda48e..f189bc2909b0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1425,12 +1425,16 @@ static struct ceph_osd_request *rbd_osd_req_create( break; /* Nothing to do */ case OBJ_REQUEST_BIO: rbd_assert(obj_request->bio_list != NULL); + osd_req->r_data.type = CEPH_OSD_DATA_TYPE_BIO; osd_req->r_data.bio = obj_request->bio_list; break; case OBJ_REQUEST_PAGES: + osd_req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; osd_req->r_data.pages = obj_request->pages; osd_req->r_data.num_pages = obj_request->page_count; osd_req->r_data.alignment = offset & ~PAGE_MASK; + osd_req->r_data.pages_from_pool = false; + osd_req->r_data.own_pages = false; break; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3a1a77b0ae9f..276fe96f12e3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -243,6 +243,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ + BUG_ON(req->r_data.type != CEPH_OSD_DATA_TYPE_PAGES); for (i = 0; i < req->r_data.num_pages; i++, bytes -= PAGE_CACHE_SIZE) { struct page *page = req->r_data.pages[i]; @@ -336,6 +337,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } + req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data.pages = pages; req->r_data.num_pages = nr_pages; req->r_data.alignment = 0; @@ -561,6 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req, long writeback_stat; unsigned issued = ceph_caps_issued(ci); + BUG_ON(req->r_data.type != 
CEPH_OSD_DATA_TYPE_PAGES); if (rc >= 0) { /* * Assume we wrote the pages we originally sent. The @@ -830,6 +833,7 @@ get_more_pages: break; } + req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data.num_pages = calc_pages_for(0, len); req->r_data.alignment = 0; max_pages = req->r_data.num_pages; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d35fc05af06f..3643a386ab23 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -571,6 +571,7 @@ more: req->r_data.own_pages = 1; } } + req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data.pages = pages; req->r_data.num_pages = num_pages; req->r_data.alignment = page_align; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 600b8278d11e..56604b33dc3c 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -50,8 +50,17 @@ struct ceph_osd { #define CEPH_OSD_MAX_OP 10 +enum ceph_osd_data_type { + CEPH_OSD_DATA_TYPE_NONE, + CEPH_OSD_DATA_TYPE_PAGES, +#ifdef CONFIG_BLOCK + CEPH_OSD_DATA_TYPE_BIO, +#endif /* CONFIG_BLOCK */ +}; + struct ceph_osd_data { - struct { + enum ceph_osd_data_type type; + union { struct { struct page **pages; u32 num_pages; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 1f8c7a7c203b..591e1b0cccbe 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -122,7 +122,8 @@ void ceph_osdc_release_request(struct kref *kref) } if (req->r_reply) ceph_msg_put(req->r_reply); - if (req->r_data.own_pages) + if (req->r_data.type == CEPH_OSD_DATA_TYPE_PAGES && + req->r_data.own_pages) ceph_release_page_vector(req->r_data.pages, req->r_data.num_pages); ceph_put_snap_context(req->r_snapc); @@ -188,6 +189,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } req->r_reply = msg; + req->r_data.type = CEPH_OSD_DATA_TYPE_NONE; ceph_pagelist_init(&req->r_trail); /* create request message; allow space for oid */ @@ -1739,12 +1741,17 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, { int rc 
= 0; - req->r_request->pages = req->r_data.pages; - req->r_request->page_count = req->r_data.num_pages; - req->r_request->page_alignment = req->r_data.alignment; + if (req->r_data.type == CEPH_OSD_DATA_TYPE_PAGES) { + req->r_request->pages = req->r_data.pages; + req->r_request->page_count = req->r_data.num_pages; + req->r_request->page_alignment = req->r_data.alignment; #ifdef CONFIG_BLOCK - req->r_request->bio = req->r_data.bio; + } else if (req->r_data.type == CEPH_OSD_DATA_TYPE_BIO) { + req->r_request->bio = req->r_data.bio; #endif + } else { + pr_err("unknown request data type %d\n", req->r_data.type); + } req->r_request->trail = &req->r_trail; register_request(osdc, req); @@ -1944,6 +1951,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, return PTR_ERR(req); /* it may be a short read due to an object boundary */ + req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data.pages = pages; req->r_data.num_pages = calc_pages_for(page_align, *plen); req->r_data.alignment = page_align; @@ -1987,6 +1995,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, return PTR_ERR(req); /* it may be a short write due to an object boundary */ + req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data.pages = pages; req->r_data.num_pages = calc_pages_for(page_align, len); req->r_data.alignment = page_align; @@ -2083,23 +2092,30 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - int want = calc_pages_for(req->r_data.alignment, data_len); - - if (req->r_data.pages && unlikely(req->r_data.num_pages < want)) { - pr_warning("tid %lld reply has %d bytes %d pages, we" - " had only %d pages ready\n", tid, data_len, - want, req->r_data.num_pages); - *skip = 1; - ceph_msg_put(m); - m = NULL; - goto out; - } - m->pages = req->r_data.pages; - m->page_count = req->r_data.num_pages; - m->page_alignment = req->r_data.alignment; + if (req->r_data.type == CEPH_OSD_DATA_TYPE_PAGES) { + int 
want; + + want = calc_pages_for(req->r_data.alignment, data_len); + if (req->r_data.pages && + unlikely(req->r_data.num_pages < want)) { + + pr_warning("tid %lld reply has %d bytes %d " + "pages, we had only %d pages ready\n", + tid, data_len, want, + req->r_data.num_pages); + *skip = 1; + ceph_msg_put(m); + m = NULL; + goto out; + } + m->pages = req->r_data.pages; + m->page_count = req->r_data.num_pages; + m->page_alignment = req->r_data.alignment; #ifdef CONFIG_BLOCK - m->bio = req->r_data.bio; + } else if (req->r_data.type == CEPH_OSD_DATA_TYPE_BIO) { + m->bio = req->r_data.bio; #endif + } } *skip = 0; req->r_con_filling_msg = con->ops->get(con); -- cgit v1.2.3 From 0fff87ec798abdb4a99f01cbb0197266bb68c5dc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: separate read and write data An osd request defines information about where data to be read should be placed as well as where data to write comes from. Currently these are represented by common fields. Keep information about data for writing separate from data to be read by splitting these into data_in and data_out fields. This is the key patch in this whole series, in that it actually identifies which osd requests generate outgoing data and which generate incoming data. It's less obvious (currently) that an osd CALL op generates both outgoing and incoming data; that's the focus of some upcoming work. 
This resolves: http://tracker.ceph.com/issues/4127 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 18 +++++---- fs/ceph/addr.c | 67 ++++++++++++++++++--------------- fs/ceph/file.c | 10 ++--- include/linux/ceph/osd_client.h | 5 ++- net/ceph/osd_client.c | 83 +++++++++++++++++++++++++---------------- 5 files changed, 105 insertions(+), 78 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f189bc2909b0..3f69eb1bc656 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1398,6 +1398,7 @@ static struct ceph_osd_request *rbd_osd_req_create( struct ceph_snap_context *snapc = NULL; struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; + struct ceph_osd_data *osd_data; struct timespec now; struct timespec *mtime; u64 snap_id = CEPH_NOSNAP; @@ -1418,6 +1419,7 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); if (!osd_req) return NULL; /* ENOMEM */ + osd_data = write_request ? 
&osd_req->r_data_out : &osd_req->r_data_in; rbd_assert(obj_request_type_valid(obj_request->type)); switch (obj_request->type) { @@ -1425,16 +1427,16 @@ static struct ceph_osd_request *rbd_osd_req_create( break; /* Nothing to do */ case OBJ_REQUEST_BIO: rbd_assert(obj_request->bio_list != NULL); - osd_req->r_data.type = CEPH_OSD_DATA_TYPE_BIO; - osd_req->r_data.bio = obj_request->bio_list; + osd_data->type = CEPH_OSD_DATA_TYPE_BIO; + osd_data->bio = obj_request->bio_list; break; case OBJ_REQUEST_PAGES: - osd_req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; - osd_req->r_data.pages = obj_request->pages; - osd_req->r_data.num_pages = obj_request->page_count; - osd_req->r_data.alignment = offset & ~PAGE_MASK; - osd_req->r_data.pages_from_pool = false; - osd_req->r_data.own_pages = false; + osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; + osd_data->pages = obj_request->pages; + osd_data->num_pages = obj_request->page_count; + osd_data->alignment = offset & ~PAGE_MASK; + osd_data->pages_from_pool = false; + osd_data->own_pages = false; break; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 276fe96f12e3..c117c51741d5 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -243,9 +243,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ - BUG_ON(req->r_data.type != CEPH_OSD_DATA_TYPE_PAGES); - for (i = 0; i < req->r_data.num_pages; i++, bytes -= PAGE_CACHE_SIZE) { - struct page *page = req->r_data.pages[i]; + BUG_ON(req->r_data_in.type != CEPH_OSD_DATA_TYPE_PAGES); + for (i = 0; i < req->r_data_in.num_pages; i++) { + struct page *page = req->r_data_in.pages[i]; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ @@ -258,8 +258,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) SetPageUptodate(page); unlock_page(page); page_cache_release(page); + bytes -= PAGE_CACHE_SIZE; } - 
kfree(req->r_data.pages); + kfree(req->r_data_in.pages); } static void ceph_unlock_page_vector(struct page **pages, int num_pages) @@ -337,10 +338,10 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data.pages = pages; - req->r_data.num_pages = nr_pages; - req->r_data.alignment = 0; + req->r_data_in.type = CEPH_OSD_DATA_TYPE_PAGES; + req->r_data_in.pages = pages; + req->r_data_in.num_pages = nr_pages; + req->r_data_in.alignment = 0; req->r_callback = finish_read; req->r_inode = inode; @@ -563,7 +564,7 @@ static void writepages_finish(struct ceph_osd_request *req, long writeback_stat; unsigned issued = ceph_caps_issued(ci); - BUG_ON(req->r_data.type != CEPH_OSD_DATA_TYPE_PAGES); + BUG_ON(req->r_data_out.type != CEPH_OSD_DATA_TYPE_PAGES); if (rc >= 0) { /* * Assume we wrote the pages we originally sent. The @@ -571,7 +572,7 @@ static void writepages_finish(struct ceph_osd_request *req, * raced with a truncation and was adjusted at the osd, * so don't believe the reply. 
*/ - wrote = req->r_data.num_pages; + wrote = req->r_data_out.num_pages; } else { wrote = 0; mapping_set_error(mapping, rc); @@ -580,8 +581,8 @@ static void writepages_finish(struct ceph_osd_request *req, inode, rc, bytes, wrote); /* clean all pages */ - for (i = 0; i < req->r_data.num_pages; i++) { - page = req->r_data.pages[i]; + for (i = 0; i < req->r_data_out.num_pages; i++) { + page = req->r_data_out.pages[i]; BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -610,31 +611,34 @@ static void writepages_finish(struct ceph_osd_request *req, unlock_page(page); } dout("%p wrote+cleaned %d pages\n", inode, wrote); - ceph_put_wrbuffer_cap_refs(ci, req->r_data.num_pages, snapc); + ceph_put_wrbuffer_cap_refs(ci, req->r_data_out.num_pages, snapc); - ceph_release_pages(req->r_data.pages, req->r_data.num_pages); - if (req->r_data.pages_from_pool) - mempool_free(req->r_data.pages, + ceph_release_pages(req->r_data_out.pages, req->r_data_out.num_pages); + if (req->r_data_out.pages_from_pool) + mempool_free(req->r_data_out.pages, ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); else - kfree(req->r_data.pages); + kfree(req->r_data_out.pages); ceph_osdc_put_request(req); } /* * allocate a page vec, either directly, or if necessary, via a the - * mempool. we avoid the mempool if we can because req->r_data.num_pages + * mempool. we avoid the mempool if we can because req->r_data_out.num_pages * may be less than the maximum write size. 
*/ static void alloc_page_vec(struct ceph_fs_client *fsc, struct ceph_osd_request *req) { - req->r_data.pages = kmalloc(sizeof(struct page *) * req->r_data.num_pages, - GFP_NOFS); - if (!req->r_data.pages) { - req->r_data.pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); - req->r_data.pages_from_pool = 1; - WARN_ON(!req->r_data.pages); + size_t size; + + size = sizeof (struct page *) * req->r_data_out.num_pages; + req->r_data_out.pages = kmalloc(size, GFP_NOFS); + if (!req->r_data_out.pages) { + req->r_data_out.pages = mempool_alloc(fsc->wb_pagevec_pool, + GFP_NOFS); + req->r_data_out.pages_from_pool = 1; + WARN_ON(!req->r_data_out.pages); } } @@ -833,10 +837,11 @@ get_more_pages: break; } - req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data.num_pages = calc_pages_for(0, len); - req->r_data.alignment = 0; - max_pages = req->r_data.num_pages; + req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; + req->r_data_out.num_pages = + calc_pages_for(0, len); + req->r_data_out.alignment = 0; + max_pages = req->r_data_out.num_pages; alloc_page_vec(fsc, req); req->r_callback = writepages_finish; @@ -858,7 +863,7 @@ get_more_pages: } set_page_writeback(page); - req->r_data.pages[locked_pages] = page; + req->r_data_out.pages[locked_pages] = page; locked_pages++; next = page->index + 1; } @@ -888,14 +893,14 @@ get_more_pages: } /* submit the write */ - offset = req->r_data.pages[0]->index << PAGE_CACHE_SHIFT; + offset = req->r_data_out.pages[0]->index << PAGE_CACHE_SHIFT; len = min((snap_size ? 
snap_size : i_size_read(inode)) - offset, (u64)locked_pages << PAGE_CACHE_SHIFT); dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); /* revise final length, page count */ - req->r_data.num_pages = locked_pages; + req->r_data_out.num_pages = locked_pages; req->r_request_ops[0].extent.length = cpu_to_le64(len); req->r_request_ops[0].payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3643a386ab23..501fb37b81a2 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -568,13 +568,13 @@ more: if ((file->f_flags & O_SYNC) == 0) { /* get a second commit callback */ req->r_safe_callback = sync_write_commit; - req->r_data.own_pages = 1; + req->r_data_out.own_pages = 1; } } - req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data.pages = pages; - req->r_data.num_pages = num_pages; - req->r_data.alignment = page_align; + req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; + req->r_data_out.pages = pages; + req->r_data_out.num_pages = num_pages; + req->r_data_out.alignment = page_align; req->r_inode = inode; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 56604b33dc3c..40e02603723d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -130,8 +130,9 @@ struct ceph_osd_request { struct ceph_file_layout r_file_layout; struct ceph_snap_context *r_snapc; /* snap context for writes */ - struct ceph_osd_data r_data; - struct ceph_pagelist r_trail; /* trailing part of the data */ + struct ceph_osd_data r_data_in; + struct ceph_osd_data r_data_out; + struct ceph_pagelist r_trail; /* trailing part of data out */ }; struct ceph_osd_event { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 591e1b0cccbe..f9cf44504484 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -122,10 +122,16 @@ void ceph_osdc_release_request(struct kref 
*kref) } if (req->r_reply) ceph_msg_put(req->r_reply); - if (req->r_data.type == CEPH_OSD_DATA_TYPE_PAGES && - req->r_data.own_pages) - ceph_release_page_vector(req->r_data.pages, - req->r_data.num_pages); + + if (req->r_data_in.type == CEPH_OSD_DATA_TYPE_PAGES && + req->r_data_in.own_pages) + ceph_release_page_vector(req->r_data_in.pages, + req->r_data_in.num_pages); + if (req->r_data_out.type == CEPH_OSD_DATA_TYPE_PAGES && + req->r_data_out.own_pages) + ceph_release_page_vector(req->r_data_out.pages, + req->r_data_out.num_pages); + ceph_put_snap_context(req->r_snapc); ceph_pagelist_release(&req->r_trail); if (req->r_mempool) @@ -189,7 +195,8 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } req->r_reply = msg; - req->r_data.type = CEPH_OSD_DATA_TYPE_NONE; + req->r_data_in.type = CEPH_OSD_DATA_TYPE_NONE; + req->r_data_out.type = CEPH_OSD_DATA_TYPE_NONE; ceph_pagelist_init(&req->r_trail); /* create request message; allow space for oid */ @@ -1740,17 +1747,21 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, bool nofail) { int rc = 0; + struct ceph_osd_data *osd_data; + + /* Set up outgoing data */ - if (req->r_data.type == CEPH_OSD_DATA_TYPE_PAGES) { - req->r_request->pages = req->r_data.pages; - req->r_request->page_count = req->r_data.num_pages; - req->r_request->page_alignment = req->r_data.alignment; + osd_data = &req->r_data_out; + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + req->r_request->pages = osd_data->pages; + req->r_request->page_count = osd_data->num_pages; + req->r_request->page_alignment = osd_data->alignment; #ifdef CONFIG_BLOCK - } else if (req->r_data.type == CEPH_OSD_DATA_TYPE_BIO) { - req->r_request->bio = req->r_data.bio; + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { + req->r_request->bio = osd_data->bio; #endif } else { - pr_err("unknown request data type %d\n", req->r_data.type); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); } req->r_request->trail = &req->r_trail; @@ 
-1939,6 +1950,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct page **pages, int num_pages, int page_align) { struct ceph_osd_request *req; + struct ceph_osd_data *osd_data; int rc = 0; dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, @@ -1951,13 +1963,15 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, return PTR_ERR(req); /* it may be a short read due to an object boundary */ - req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data.pages = pages; - req->r_data.num_pages = calc_pages_for(page_align, *plen); - req->r_data.alignment = page_align; + + osd_data = &req->r_data_in; + osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; + osd_data->pages = pages; + osd_data->num_pages = calc_pages_for(page_align, *plen); + osd_data->alignment = page_align; dout("readpages final extent is %llu~%llu (%d pages align %d)\n", - off, *plen, req->r_data.num_pages, page_align); + off, *plen, osd_data->num_pages, page_align); rc = ceph_osdc_start_request(osdc, req, false); if (!rc) @@ -1981,6 +1995,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, struct page **pages, int num_pages) { struct ceph_osd_request *req; + struct ceph_osd_data *osd_data; int rc = 0; int page_align = off & ~PAGE_MASK; @@ -1995,11 +2010,13 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, return PTR_ERR(req); /* it may be a short write due to an object boundary */ - req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data.pages = pages; - req->r_data.num_pages = calc_pages_for(page_align, len); - req->r_data.alignment = page_align; - dout("writepages %llu~%llu (%d pages)\n", off, len, req->r_data.num_pages); + osd_data = &req->r_data_out; + osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; + osd_data->pages = pages; + osd_data->num_pages = calc_pages_for(page_align, len); + osd_data->alignment = page_align; + dout("writepages %llu~%llu (%d pages)\n", off, len, + osd_data->num_pages); rc = ceph_osdc_start_request(osdc, 
req, true); if (!rc) @@ -2092,28 +2109,30 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - if (req->r_data.type == CEPH_OSD_DATA_TYPE_PAGES) { + struct ceph_osd_data *osd_data = &req->r_data_in; + + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { int want; - want = calc_pages_for(req->r_data.alignment, data_len); - if (req->r_data.pages && - unlikely(req->r_data.num_pages < want)) { + want = calc_pages_for(osd_data->alignment, data_len); + if (osd_data->pages && + unlikely(osd_data->num_pages < want)) { pr_warning("tid %lld reply has %d bytes %d " "pages, we had only %d pages ready\n", tid, data_len, want, - req->r_data.num_pages); + osd_data->num_pages); *skip = 1; ceph_msg_put(m); m = NULL; goto out; } - m->pages = req->r_data.pages; - m->page_count = req->r_data.num_pages; - m->page_alignment = req->r_data.alignment; + m->pages = osd_data->pages; + m->page_count = osd_data->num_pages; + m->page_alignment = osd_data->alignment; #ifdef CONFIG_BLOCK - } else if (req->r_data.type == CEPH_OSD_DATA_TYPE_BIO) { - m->bio = req->r_data.bio; + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { + m->bio = osd_data->bio; #endif } } -- cgit v1.2.3 From e0c594878e3211b09208c779df5f996f0b831d9e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 7 Mar 2013 15:38:25 -0600 Subject: libceph: record byte count not page count Record the byte count for an osd request rather than the page count. The number of pages can always be derived from the byte count (and alignment/offset) but the reverse is not true. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- fs/ceph/addr.c | 33 ++++++++++++++++----------- fs/ceph/file.c | 2 +- include/linux/ceph/osd_client.h | 2 +- net/ceph/osd_client.c | 50 ++++++++++++++++++++++++----------------- 5 files changed, 52 insertions(+), 37 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3f69eb1bc656..04cd5fdfc8f3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1433,7 +1433,7 @@ static struct ceph_osd_request *rbd_osd_req_create( case OBJ_REQUEST_PAGES: osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; osd_data->pages = obj_request->pages; - osd_data->num_pages = obj_request->page_count; + osd_data->length = obj_request->length; osd_data->alignment = offset & ~PAGE_MASK; osd_data->pages_from_pool = false; osd_data->own_pages = false; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c117c51741d5..45745aae4786 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -238,13 +238,16 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) struct inode *inode = req->r_inode; int rc = req->r_result; int bytes = le32_to_cpu(msg->hdr.data_len); + int num_pages; int i; dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ BUG_ON(req->r_data_in.type != CEPH_OSD_DATA_TYPE_PAGES); - for (i = 0; i < req->r_data_in.num_pages; i++) { + num_pages = calc_pages_for((u64)req->r_data_in.alignment, + (u64)req->r_data_in.length); + for (i = 0; i < num_pages; i++) { struct page *page = req->r_data_in.pages[i]; if (bytes < (int)PAGE_CACHE_SIZE) { @@ -340,7 +343,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } req->r_data_in.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data_in.pages = pages; - req->r_data_in.num_pages = nr_pages; + req->r_data_in.length = len; req->r_data_in.alignment = 0; req->r_callback = finish_read; req->r_inode = inode; @@ -555,6 
+558,7 @@ static void writepages_finish(struct ceph_osd_request *req, struct ceph_inode_info *ci = ceph_inode(inode); unsigned wrote; struct page *page; + int num_pages; int i; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; @@ -565,6 +569,8 @@ static void writepages_finish(struct ceph_osd_request *req, unsigned issued = ceph_caps_issued(ci); BUG_ON(req->r_data_out.type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)req->r_data_out.alignment, + (u64)req->r_data_out.length); if (rc >= 0) { /* * Assume we wrote the pages we originally sent. The @@ -572,7 +578,7 @@ static void writepages_finish(struct ceph_osd_request *req, * raced with a truncation and was adjusted at the osd, * so don't believe the reply. */ - wrote = req->r_data_out.num_pages; + wrote = num_pages; } else { wrote = 0; mapping_set_error(mapping, rc); @@ -581,7 +587,7 @@ static void writepages_finish(struct ceph_osd_request *req, inode, rc, bytes, wrote); /* clean all pages */ - for (i = 0; i < req->r_data_out.num_pages; i++) { + for (i = 0; i < num_pages; i++) { page = req->r_data_out.pages[i]; BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -611,9 +617,9 @@ static void writepages_finish(struct ceph_osd_request *req, unlock_page(page); } dout("%p wrote+cleaned %d pages\n", inode, wrote); - ceph_put_wrbuffer_cap_refs(ci, req->r_data_out.num_pages, snapc); + ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); - ceph_release_pages(req->r_data_out.pages, req->r_data_out.num_pages); + ceph_release_pages(req->r_data_out.pages, num_pages); if (req->r_data_out.pages_from_pool) mempool_free(req->r_data_out.pages, ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); @@ -624,15 +630,18 @@ static void writepages_finish(struct ceph_osd_request *req, /* * allocate a page vec, either directly, or if necessary, via a the - * mempool. we avoid the mempool if we can because req->r_data_out.num_pages + * mempool. 
we avoid the mempool if we can because req->r_data_out.length * may be less than the maximum write size. */ static void alloc_page_vec(struct ceph_fs_client *fsc, struct ceph_osd_request *req) { size_t size; + int num_pages; - size = sizeof (struct page *) * req->r_data_out.num_pages; + num_pages = calc_pages_for((u64)req->r_data_out.alignment, + (u64)req->r_data_out.length); + size = sizeof (struct page *) * num_pages; req->r_data_out.pages = kmalloc(size, GFP_NOFS); if (!req->r_data_out.pages) { req->r_data_out.pages = mempool_alloc(fsc->wb_pagevec_pool, @@ -838,11 +847,9 @@ get_more_pages: } req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data_out.num_pages = - calc_pages_for(0, len); + req->r_data_out.length = len; req->r_data_out.alignment = 0; - max_pages = req->r_data_out.num_pages; - + max_pages = calc_pages_for(0, (u64)len); alloc_page_vec(fsc, req); req->r_callback = writepages_finish; req->r_inode = inode; @@ -900,7 +907,7 @@ get_more_pages: locked_pages, offset, len); /* revise final length, page count */ - req->r_data_out.num_pages = locked_pages; + req->r_data_out.length = len; req->r_request_ops[0].extent.length = cpu_to_le64(len); req->r_request_ops[0].payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 501fb37b81a2..0ac6e159bdc6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -573,7 +573,7 @@ more: } req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; req->r_data_out.pages = pages; - req->r_data_out.num_pages = num_pages; + req->r_data_out.length = len; req->r_data_out.alignment = page_align; req->r_inode = inode; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 40e02603723d..a8016dfbfdba 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -63,7 +63,7 @@ struct ceph_osd_data { union { struct { struct page **pages; - u32 num_pages; + u64 length; u32 alignment; bool pages_from_pool; bool 
own_pages; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f9cf44504484..202af14dc6dc 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -107,6 +107,7 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, */ void ceph_osdc_release_request(struct kref *kref) { + int num_pages; struct ceph_osd_request *req = container_of(kref, struct ceph_osd_request, r_kref); @@ -124,13 +125,17 @@ void ceph_osdc_release_request(struct kref *kref) ceph_msg_put(req->r_reply); if (req->r_data_in.type == CEPH_OSD_DATA_TYPE_PAGES && - req->r_data_in.own_pages) - ceph_release_page_vector(req->r_data_in.pages, - req->r_data_in.num_pages); + req->r_data_in.own_pages) { + num_pages = calc_pages_for((u64)req->r_data_in.alignment, + (u64)req->r_data_in.length); + ceph_release_page_vector(req->r_data_in.pages, num_pages); + } if (req->r_data_out.type == CEPH_OSD_DATA_TYPE_PAGES && - req->r_data_out.own_pages) - ceph_release_page_vector(req->r_data_out.pages, - req->r_data_out.num_pages); + req->r_data_out.own_pages) { + num_pages = calc_pages_for((u64)req->r_data_out.alignment, + (u64)req->r_data_out.length); + ceph_release_page_vector(req->r_data_out.pages, num_pages); + } ceph_put_snap_context(req->r_snapc); ceph_pagelist_release(&req->r_trail); @@ -1753,8 +1758,12 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, osd_data = &req->r_data_out; if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + unsigned int page_count; + req->r_request->pages = osd_data->pages; - req->r_request->page_count = osd_data->num_pages; + page_count = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + req->r_request->page_count = page_count; req->r_request->page_alignment = osd_data->alignment; #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { @@ -1967,11 +1976,11 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, osd_data = &req->r_data_in; osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; osd_data->pages = 
pages; - osd_data->num_pages = calc_pages_for(page_align, *plen); + osd_data->length = *plen; osd_data->alignment = page_align; - dout("readpages final extent is %llu~%llu (%d pages align %d)\n", - off, *plen, osd_data->num_pages, page_align); + dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", + off, *plen, osd_data->length, page_align); rc = ceph_osdc_start_request(osdc, req, false); if (!rc) @@ -2013,10 +2022,9 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, osd_data = &req->r_data_out; osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; osd_data->pages = pages; - osd_data->num_pages = calc_pages_for(page_align, len); + osd_data->length = len; osd_data->alignment = page_align; - dout("writepages %llu~%llu (%d pages)\n", off, len, - osd_data->num_pages); + dout("writepages %llu~%llu (%llu bytes)\n", off, len, osd_data->length); rc = ceph_osdc_start_request(osdc, req, true); if (!rc) @@ -2112,23 +2120,23 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_osd_data *osd_data = &req->r_data_in; if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { - int want; + unsigned int page_count; - want = calc_pages_for(osd_data->alignment, data_len); if (osd_data->pages && - unlikely(osd_data->num_pages < want)) { + unlikely(osd_data->length < data_len)) { - pr_warning("tid %lld reply has %d bytes %d " - "pages, we had only %d pages ready\n", - tid, data_len, want, - osd_data->num_pages); + pr_warning("tid %lld reply has %d bytes " + "we had only %llu bytes ready\n", + tid, data_len, osd_data->length); *skip = 1; ceph_msg_put(m); m = NULL; goto out; } + page_count = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); m->pages = osd_data->pages; - m->page_count = osd_data->num_pages; + m->page_count = page_count; m->page_alignment = osd_data->alignment; #ifdef CONFIG_BLOCK } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { -- cgit v1.2.3 From 175face2ba31025b0dcd6da4e711fca7764287fa Mon Sep 17 
00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 13:35:36 -0600 Subject: libceph: let osd ops determine request data length The length of outgoing data in an osd request is dependent on the osd ops that are embedded in that request. Each op is encoded into a request message using osd_req_encode_op(), so that should be used to determine the amount of outgoing data implied by the op as it is encoded. Have osd_req_encode_op() return the number of bytes of outgoing data implied by the op being encoded, and accumulate and use that in ceph_osdc_build_request(). As a result, ceph_osdc_build_request() no longer requires its "len" parameter, so get rid of it. Using the sum of the op lengths rather than the length provided is a valid change because: - The only callers of ceph_osdc_build_request() are rbd and the osd client (in ceph_osdc_new_request() on behalf of the file system). - When rbd calls it, the length provided is only non-zero for write requests, and in that case the single op has the same length value as what was passed here. - When called from ceph_osdc_new_request(), (it's not all that easy to see, but) the length passed is also always the same as the extent length encoded in its (single) write op if present.
This resolves: http://tracker.ceph.com/issues/4406 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- include/linux/ceph/osd_client.h | 3 +-- net/ceph/osd_client.c | 33 +++++++++++++++++++-------------- 3 files changed, 21 insertions(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 04cd5fdfc8f3..dea4401c4f77 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1462,7 +1462,7 @@ static struct ceph_osd_request *rbd_osd_req_create( /* osd_req will get its own reference to snapc (if non-null) */ - ceph_osdc_build_request(osd_req, offset, length, 1, op, + ceph_osdc_build_request(osd_req, offset, 1, op, snapc, snap_id, mtime); return osd_req; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index a8016dfbfdba..bcf3f72ec3f8 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -249,8 +249,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * bool use_mempool, gfp_t gfp_flags); -extern void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 len, +extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, unsigned int num_op, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 37d89614a61b..ce34faaa453f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -222,10 +222,13 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_alloc_request); -static void osd_req_encode_op(struct ceph_osd_request *req, +static u64 osd_req_encode_op(struct ceph_osd_request *req, struct ceph_osd_op *dst, struct ceph_osd_req_op *src) { + u64 out_data_len = 0; + u64 tmp; + dst->op = cpu_to_le16(src->op); switch (src->op) { @@ -233,10 +236,10 @@ static void osd_req_encode_op(struct ceph_osd_request *req, break; case CEPH_OSD_OP_READ: 
case CEPH_OSD_OP_WRITE: - dst->extent.offset = - cpu_to_le64(src->extent.offset); - dst->extent.length = - cpu_to_le64(src->extent.length); + if (src->op == CEPH_OSD_OP_WRITE) + out_data_len = src->extent.length; + dst->extent.offset = cpu_to_le64(src->extent.offset); + dst->extent.length = cpu_to_le64(src->extent.length); dst->extent.truncate_size = cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = @@ -247,12 +250,14 @@ static void osd_req_encode_op(struct ceph_osd_request *req, dst->cls.method_len = src->cls.method_len; dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); + tmp = req->r_trail.length; ceph_pagelist_append(&req->r_trail, src->cls.class_name, src->cls.class_len); ceph_pagelist_append(&req->r_trail, src->cls.method_name, src->cls.method_len); ceph_pagelist_append(&req->r_trail, src->cls.indata, src->cls.indata_len); + out_data_len = req->r_trail.length - tmp; break; case CEPH_OSD_OP_STARTSYNC: break; @@ -326,6 +331,8 @@ static void osd_req_encode_op(struct ceph_osd_request *req, break; } dst->payload_len = cpu_to_le32(src->payload_len); + + return out_data_len; } /* @@ -333,7 +340,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, * */ void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 len, unsigned int num_ops, + u64 off, unsigned int num_ops, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, struct timespec *mtime) @@ -385,12 +392,13 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); p += req->r_oid_len; - /* ops */ + /* ops--can imply data */ ceph_encode_16(&p, num_ops); src_op = src_ops; req->r_request_ops = p; + data_len = 0; for (i = 0; i < num_ops; i++, src_op++) { - osd_req_encode_op(req, p, src_op); + data_len += osd_req_encode_op(req, p, src_op); p += sizeof(struct ceph_osd_op); } @@ -407,11 +415,9 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, 
req->r_request_attempts = p; p += 4; - data_len = req->r_trail.length; - if (flags & CEPH_OSD_FLAG_WRITE) { + /* data */ + if (flags & CEPH_OSD_FLAG_WRITE) req->r_request->hdr.data_off = cpu_to_le16(off); - data_len += len; - } req->r_request->hdr.data_len = cpu_to_le32(data_len); BUG_ON(p > msg->front.iov_base + msg->front.iov_len); @@ -477,13 +483,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, ceph_osdc_put_request(req); return ERR_PTR(r); } - req->r_file_layout = *layout; /* keep a copy */ snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); - ceph_osdc_build_request(req, off, *plen, num_op, ops, + ceph_osdc_build_request(req, off, num_op, ops, snapc, vino.snap, mtime); return req; -- cgit v1.2.3 From adfe695a25e92e3a4597807fbc7f9a8105218776 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 13 Mar 2013 20:50:00 -0500 Subject: ceph: move max constant definitions Move some definitions for max integer values out of the rbd code and into the more central "decode.h" header file. These really belong in a Linux (or libc) header somewhere, but I haven't gotten around to proposing that yet. This is in preparation for moving some code out of rbd.c and into the osd client. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 7 ------- include/linux/ceph/decode.h | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index dea4401c4f77..6ed508bd363a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -52,13 +52,6 @@ #define SECTOR_SHIFT 9 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) -/* It might be useful to have these defined elsewhere */ - -#define U8_MAX ((u8) (~0U)) -#define U16_MAX ((u16) (~0U)) -#define U32_MAX ((u32) (~0U)) -#define U64_MAX ((u64) (~0ULL)) - #define RBD_DRV_NAME "rbd" #define RBD_DRV_NAME_LONG "rbd (rados block device)" diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 360d9d08ca9e..689f1df37bff 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -8,6 +8,13 @@ #include +/* This seemed to be the easiest place to define these */ + +#define U8_MAX ((u8) (~0U)) +#define U16_MAX ((u16) (~0U)) +#define U32_MAX ((u32) (~0U)) +#define U64_MAX ((u64) (~0ULL)) + /* * in all cases, * void **p pointer to position pointer -- cgit v1.2.3 From 33803f3300265661b5c5d20a9811c6a2a157d545 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 13 Mar 2013 20:50:00 -0500 Subject: libceph: define source request op functions The rbd code has a function that allocates and populates a ceph_osd_req_op structure (the in-core version of an osd request operation). When reviewed, Josh suggested two things: that the big varargs function might be better split into type-specific functions; and that this functionality really belongs in the osd client rather than rbd. This patch implements both of Josh's suggestions. It breaks up the rbd function into separate functions and defines them in the osd client module as exported interfaces. 
Unlike the rbd version, however, the functions don't allocate an osd_req_op structure; they are provided the address of one and that is initialized instead. The rbd function has been eliminated and calls to it have been replaced by calls to the new routines. The rbd code now uses a stack (struct) variable to hold the op rather than allocating and freeing it each time. For now only the capabilities used by rbd are implemented. Implementing all the other osd op types, and making the rest of the code use it will be done separately, in the next few patches. Note that only the extent, cls, and watch portions of the ceph_osd_req_op structure are currently used. Delete the others (xattr, pgls, and snap) from its definition so nobody thinks it's actually implemented or needed. We can add it back again later if needed, when we know it's been tested. This (and a few follow-on patches) resolves: http://tracker.ceph.com/issues/3861 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 117 ++++++---------------------------------- include/linux/ceph/osd_client.h | 26 ++++----- net/ceph/osd_client.c | 84 +++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 116 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6ed508bd363a..f04d45b6b563 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1134,76 +1134,6 @@ static bool obj_request_type_valid(enum obj_request_type type) } } -static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
-{ - struct ceph_osd_req_op *op; - va_list args; - size_t size; - - op = kzalloc(sizeof (*op), GFP_NOIO); - if (!op) - return NULL; - op->op = opcode; - va_start(args, opcode); - switch (opcode) { - case CEPH_OSD_OP_READ: - case CEPH_OSD_OP_WRITE: - /* rbd_osd_req_op_create(READ, offset, length) */ - /* rbd_osd_req_op_create(WRITE, offset, length) */ - op->extent.offset = va_arg(args, u64); - op->extent.length = va_arg(args, u64); - if (opcode == CEPH_OSD_OP_WRITE) - op->payload_len = op->extent.length; - break; - case CEPH_OSD_OP_STAT: - break; - case CEPH_OSD_OP_CALL: - /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */ - op->cls.class_name = va_arg(args, char *); - size = strlen(op->cls.class_name); - rbd_assert(size <= (size_t) U8_MAX); - op->cls.class_len = size; - op->payload_len = size; - - op->cls.method_name = va_arg(args, char *); - size = strlen(op->cls.method_name); - rbd_assert(size <= (size_t) U8_MAX); - op->cls.method_len = size; - op->payload_len += size; - - op->cls.argc = 0; - op->cls.indata = va_arg(args, void *); - size = va_arg(args, size_t); - rbd_assert(size <= (size_t) U32_MAX); - op->cls.indata_len = (u32) size; - op->payload_len += size; - break; - case CEPH_OSD_OP_NOTIFY_ACK: - case CEPH_OSD_OP_WATCH: - /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */ - /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */ - op->watch.cookie = va_arg(args, u64); - op->watch.ver = va_arg(args, u64); - op->watch.ver = cpu_to_le64(op->watch.ver); - if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int)) - op->watch.flag = (u8) 1; - break; - default: - rbd_warn(NULL, "unsupported opcode %hu\n", opcode); - kfree(op); - op = NULL; - break; - } - va_end(args); - - return op; -} - -static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) -{ - kfree(op); -} - static int rbd_obj_request_submit(struct ceph_osd_client *osdc, struct rbd_obj_request *obj_request) { @@ -1628,7 +1558,7 @@ static int rbd_img_request_fill_bio(struct 
rbd_img_request *img_request, while (resid) { const char *object_name; unsigned int clone_size; - struct ceph_osd_req_op *op; + struct ceph_osd_req_op op; u64 offset; u64 length; @@ -1657,13 +1587,10 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, * request. Note that the contents of the op are * copied by rbd_osd_req_create(). */ - op = rbd_osd_req_op_create(opcode, offset, length); - if (!op) - goto out_partial; + osd_req_op_extent_init(&op, opcode, offset, length, 0, 0); obj_request->osd_req = rbd_osd_req_create(rbd_dev, img_request->write_request, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request, &op); if (!obj_request->osd_req) goto out_partial; /* status and version are initially zero-filled */ @@ -1766,7 +1693,7 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 ver, u64 notify_id) { struct rbd_obj_request *obj_request; - struct ceph_osd_req_op *op; + struct ceph_osd_req_op op; struct ceph_osd_client *osdc; int ret; @@ -1776,12 +1703,9 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, return -ENOMEM; ret = -ENOMEM; - op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); - if (!op) - goto out; + osd_req_op_watch_init(&op, CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver, 0); obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request, &op); if (!obj_request->osd_req) goto out; @@ -1823,7 +1747,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; - struct ceph_osd_req_op *op; + struct ceph_osd_req_op op; int ret; rbd_assert(start ^ !!rbd_dev->watch_event); @@ -1843,14 +1767,11 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) if (!obj_request) goto out_cancel; - op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, + osd_req_op_watch_init(&op, CEPH_OSD_OP_WATCH, 
rbd_dev->watch_event->cookie, rbd_dev->header.obj_version, start); - if (!op) - goto out_cancel; obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request, &op); if (!obj_request->osd_req) goto out_cancel; @@ -1912,7 +1833,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, { struct rbd_obj_request *obj_request; struct ceph_osd_client *osdc; - struct ceph_osd_req_op *op; + struct ceph_osd_req_op op; struct page **pages; u32 page_count; int ret; @@ -1939,13 +1860,10 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, - method_name, outbound, outbound_size); - if (!op) - goto out; + osd_req_op_cls_init(&op, CEPH_OSD_OP_CALL, class_name, method_name, + outbound, outbound_size); obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request, &op); if (!obj_request->osd_req) goto out; @@ -2125,7 +2043,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, char *buf, u64 *version) { - struct ceph_osd_req_op *op; + struct ceph_osd_req_op op; struct rbd_obj_request *obj_request; struct ceph_osd_client *osdc; struct page **pages = NULL; @@ -2147,12 +2065,9 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length); - if (!op) - goto out; + osd_req_op_extent_init(&op, CEPH_OSD_OP_READ, offset, length, 0, 0); obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, - obj_request, op); - rbd_osd_req_op_destroy(op); + obj_request, &op); if (!obj_request->osd_req) goto out; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 1dab291b2dc6..5fd2cbfcfd91 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -201,14 
+201,6 @@ struct ceph_osd_req_op { u64 truncate_size; u32 truncate_seq; } extent; - struct { - const char *name; - const void *val; - u32 name_len; - u32 value_len; - __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ - __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ - } xattr; struct { const char *class_name; const char *method_name; @@ -218,13 +210,6 @@ struct ceph_osd_req_op { __u8 method_len; __u8 argc; } cls; - struct { - u64 cookie; - u64 count; - } pgls; - struct { - u64 snapid; - } snap; struct { u64 cookie; u64 ver; @@ -244,6 +229,17 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +extern void osd_req_op_init(struct ceph_osd_req_op *op, u16 opcode); +extern void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, + u64 offset, u64 length, + u64 truncate_size, u32 truncate_seq); +extern void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, + const char *class, const char *method, + const void *request_data, + size_t request_data_size); +extern void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, + u64 cookie, u64 version, int flag); + extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, unsigned int num_op, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 4e5c0438ea35..02ed72820479 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -289,6 +289,90 @@ static bool osd_req_opcode_valid(u16 opcode) } } +/* + * This is an osd op init function for opcodes that have no data or + * other information associated with them. It also serves as a + * common init routine for all the other init functions, below. 
+ */ +void osd_req_op_init(struct ceph_osd_req_op *op, u16 opcode) +{ + BUG_ON(!osd_req_opcode_valid(opcode)); + + memset(op, 0, sizeof (*op)); + + op->op = opcode; +} + +void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, + u64 offset, u64 length, + u64 truncate_size, u32 truncate_seq) +{ + size_t payload_len = 0; + + BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); + + osd_req_op_init(op, opcode); + + op->extent.offset = offset; + op->extent.length = length; + op->extent.truncate_size = truncate_size; + op->extent.truncate_seq = truncate_seq; + if (opcode == CEPH_OSD_OP_WRITE) + payload_len += length; + + op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_extent_init); + +void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, + const char *class, const char *method, + const void *request_data, size_t request_data_size) +{ + size_t payload_len = 0; + size_t size; + + BUG_ON(opcode != CEPH_OSD_OP_CALL); + + osd_req_op_init(op, opcode); + + op->cls.class_name = class; + size = strlen(class); + BUG_ON(size > (size_t) U8_MAX); + op->cls.class_len = size; + payload_len += size; + + op->cls.method_name = method; + size = strlen(method); + BUG_ON(size > (size_t) U8_MAX); + op->cls.method_len = size; + payload_len += size; + + op->cls.indata = request_data; + BUG_ON(request_data_size > (size_t) U32_MAX); + op->cls.indata_len = (u32) request_data_size; + payload_len += request_data_size; + + op->cls.argc = 0; /* currently unused */ + + op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_cls_init); + +void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, + u64 cookie, u64 version, int flag) +{ + BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); + + osd_req_op_init(op, opcode); + + op->watch.cookie = cookie; + /* op->watch.ver = version; */ /* XXX 3847 */ + op->watch.ver = cpu_to_le64(version); + if (opcode == CEPH_OSD_OP_WATCH && flag) + op->watch.flag = (u8) 1; +} 
+EXPORT_SYMBOL(osd_req_op_watch_init); + static u64 osd_req_encode_op(struct ceph_osd_request *req, struct ceph_osd_op *dst, struct ceph_osd_req_op *src) -- cgit v1.2.3 From fdce58ccb5df621695b079378c619046acabc778 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:06 -0500 Subject: libceph: record length of bio list with bio When assigning a bio pointer to an osd request, we don't have an efficient way of knowing the total length bytes in the bio list. That information is available at the point it's set up by the rbd code, so record it with the osd data when it's set. This and the next patch are related to maintaining the length of a message's data independent of the message header, as described here: http://tracker.ceph.com/issues/4589 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 1 + include/linux/ceph/osd_client.h | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f04d45b6b563..e95a92e89330 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1352,6 +1352,7 @@ static struct ceph_osd_request *rbd_osd_req_create( rbd_assert(obj_request->bio_list != NULL); osd_data->type = CEPH_OSD_DATA_TYPE_BIO; osd_data->bio = obj_request->bio_list; + osd_data->bio_length = obj_request->length; break; case OBJ_REQUEST_PAGES: osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3b5ba31c2cbd..fdda93ebbb4c 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -71,7 +71,10 @@ struct ceph_osd_data { }; struct ceph_pagelist *pagelist; #ifdef CONFIG_BLOCK - struct bio *bio; + struct { + struct bio *bio; /* list of bios */ + size_t bio_length; /* total in list */ + }; #endif /* CONFIG_BLOCK */ }; }; -- cgit v1.2.3 From 6010a451c38b04cf10808a508f33e5bf32e7de63 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:11 
-0500 Subject: rbd: define inbound data size for method ops When rbd creates an object request containing an object method call operation it is passing 0 for the size. I originally thought this was because the length was not needed for method calls, but I think it really should be supplied, to describe how much space is available to receive response data. So provide the supplied length. This resolves: http://tracker.ceph.com/issues/4659 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e95a92e89330..afbc9f6f8ff1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1840,12 +1840,11 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, int ret; /* - * Method calls are ultimately read operations but they - * don't involve object data (so no offset or length). - * The result should placed into the inbound buffer - * provided. They also supply outbound data--parameters for - * the object method. Currently if this is present it will - * be a snapshot id. + * Method calls are ultimately read operations. The result + * should placed into the inbound buffer provided. They + * also supply outbound data--parameters for the object + * method. Currently if this is present it will be a + * snapshot id. 
*/ page_count = (u32) calc_pages_for(0, inbound_size); pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); @@ -1853,7 +1852,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, return PTR_ERR(pages); ret = -ENOMEM; - obj_request = rbd_obj_request_create(object_name, 0, 0, + obj_request = rbd_obj_request_create(object_name, 0, inbound_size, OBJ_REQUEST_PAGES); if (!obj_request) goto out; -- cgit v1.2.3 From 43bfe5de9fa78e07248b70992ce50321efec622c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 01:28:57 -0500 Subject: libceph: define osd data initialization helpers Define and use functions that encapsulate the initialization of a ceph_osd_data structure. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 14 ++++------- fs/ceph/addr.c | 13 +++------- fs/ceph/file.c | 10 +++----- include/linux/ceph/osd_client.h | 11 +++++++++ net/ceph/osd_client.c | 55 +++++++++++++++++++++++++++++------------ 5 files changed, 63 insertions(+), 40 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index afbc9f6f8ff1..ab21b5218ae3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1350,17 +1350,13 @@ static struct ceph_osd_request *rbd_osd_req_create( break; /* Nothing to do */ case OBJ_REQUEST_BIO: rbd_assert(obj_request->bio_list != NULL); - osd_data->type = CEPH_OSD_DATA_TYPE_BIO; - osd_data->bio = obj_request->bio_list; - osd_data->bio_length = obj_request->length; + ceph_osd_data_bio_init(osd_data, obj_request->bio_list, + obj_request->length); break; case OBJ_REQUEST_PAGES: - osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; - osd_data->pages = obj_request->pages; - osd_data->length = obj_request->length; - osd_data->alignment = offset & ~PAGE_MASK; - osd_data->pages_from_pool = false; - osd_data->own_pages = false; + ceph_osd_data_pages_init(osd_data, obj_request->pages, + obj_request->length, offset & ~PAGE_MASK, + false, false); break; } diff --git a/fs/ceph/addr.c
b/fs/ceph/addr.c index 5d8ce79385ed..cf9032abc8f5 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -342,10 +342,8 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - req->r_data_in.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data_in.pages = pages; - req->r_data_in.length = len; - req->r_data_in.alignment = 0; + ceph_osd_data_pages_init(&req->r_data_in, pages, len, 0, + false, false); req->r_callback = finish_read; req->r_inode = inode; @@ -917,11 +915,8 @@ get_more_pages: dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data_out.pages = pages; - req->r_data_out.length = len; - req->r_data_out.alignment = 0; - req->r_data_out.pages_from_pool = !!pool; + ceph_osd_data_pages_init(&req->r_data_out, pages, len, 0, + !!pool, false); pages = NULL; /* request message now owns the pages array */ pool = NULL; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 47826c2ef511..da642af14a28 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -491,6 +491,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, unsigned long buf_align; int ret; struct timespec mtime = CURRENT_TIME; + bool own_pages = false; if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; @@ -571,14 +572,11 @@ more: if ((file->f_flags & O_SYNC) == 0) { /* get a second commit callback */ req->r_safe_callback = sync_write_commit; - req->r_data_out.own_pages = 1; + own_pages = true; } } - req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGES; - req->r_data_out.pages = pages; - req->r_data_out.length = len; - req->r_data_out.alignment = page_align; - req->r_inode = inode; + ceph_osd_data_pages_init(&req->r_data_out, pages, len, page_align, + false, own_pages); /* BUG_ON(vino.snap != CEPH_NOSNAP); */ ceph_osdc_build_request(req, pos, num_ops, ops, diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 
5ee1a3776b4b..af60dac1f9c0 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -280,6 +280,17 @@ static inline void ceph_osdc_put_request(struct ceph_osd_request *req) kref_put(&req->r_kref, ceph_osdc_release_request); } +extern void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); +extern void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, + struct ceph_pagelist *pagelist); +#ifdef CONFIG_BLOCK +extern void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, + struct bio *bio, size_t bio_length); +#endif /* CONFIG_BLOCK */ + extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, bool nofail); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 1379b3313348..f8f8561b602e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -79,6 +79,38 @@ static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, return 0; } +void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, + struct page **pages, u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; + osd_data->pages = pages; + osd_data->length = length; + osd_data->alignment = alignment; + osd_data->pages_from_pool = pages_from_pool; + osd_data->own_pages = own_pages; +} +EXPORT_SYMBOL(ceph_osd_data_pages_init); + +void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, + struct ceph_pagelist *pagelist) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST; + osd_data->pagelist = pagelist; +} +EXPORT_SYMBOL(ceph_osd_data_pagelist_init); + +#ifdef CONFIG_BLOCK +void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, + struct bio *bio, size_t bio_length) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_BIO; + osd_data->bio = bio; + osd_data->bio_length = bio_length; +} +EXPORT_SYMBOL(ceph_osd_data_bio_init); +#endif /* 
CONFIG_BLOCK */ + /* * requests */ @@ -400,8 +432,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, ceph_pagelist_append(pagelist, src->cls.indata, src->cls.indata_len); - req->r_data_out.type = CEPH_OSD_DATA_TYPE_PAGELIST; - req->r_data_out.pagelist = pagelist; + ceph_osd_data_pagelist_init(&req->r_data_out, pagelist); out_data_len = pagelist->length; break; case CEPH_OSD_OP_STARTSYNC: @@ -2056,7 +2087,6 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct page **pages, int num_pages, int page_align) { struct ceph_osd_request *req; - struct ceph_osd_data *osd_data; struct ceph_osd_req_op op; int rc = 0; @@ -2071,14 +2101,11 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, /* it may be a short read due to an object boundary */ - osd_data = &req->r_data_in; - osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; - osd_data->pages = pages; - osd_data->length = *plen; - osd_data->alignment = page_align; + ceph_osd_data_pages_init(&req->r_data_in, pages, *plen, page_align, + false, false); dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", - off, *plen, osd_data->length, page_align); + off, *plen, *plen, page_align); ceph_osdc_build_request(req, off, 1, &op, NULL, vino.snap, NULL); @@ -2104,7 +2131,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, struct page **pages, int num_pages) { struct ceph_osd_request *req; - struct ceph_osd_data *osd_data; struct ceph_osd_req_op op; int rc = 0; int page_align = off & ~PAGE_MASK; @@ -2119,12 +2145,9 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, return PTR_ERR(req); /* it may be a short write due to an object boundary */ - osd_data = &req->r_data_out; - osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; - osd_data->pages = pages; - osd_data->length = len; - osd_data->alignment = page_align; - dout("writepages %llu~%llu (%llu bytes)\n", off, len, osd_data->length); + ceph_osd_data_pages_init(&req->r_data_out, pages, len, page_align, 
+ false, false); + dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); ceph_osdc_build_request(req, off, 1, &op, snapc, CEPH_NOSNAP, mtime); -- cgit v1.2.3 From 430c28c3cb7f3dbd87de266ed52d65928957ff78 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 21:32:51 -0500 Subject: rbd: define rbd_osd_req_format_op() Define rbd_osd_req_format_op(), which encapsulates formatting an osd op into an object request's osd request message. Only one op is supported right now. Stop calling ceph_osdc_build_request() in rbd_osd_req_create(). Instead, call rbd_osd_req_format_op() in each of the callers of rbd_osd_req_create(). This is to prepare for the next patch, in which the source ops for an osd request will be held in the osd request itself. Because of that, we won't have the source op to work with until after the request is created, so we can't format the op until then. This and the next patch resolve: http://tracker.ceph.com/issues/4656 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 98 +++++++++++++++++++++++++++++------------------------ 1 file changed, 53 insertions(+), 45 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ab21b5218ae3..4a4be14a9189 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1311,29 +1311,47 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, rbd_obj_request_complete(obj_request); } +static void rbd_osd_req_format_op(struct rbd_obj_request *obj_request, + bool write_request, + struct ceph_osd_req_op *op) +{ + struct rbd_img_request *img_request = obj_request->img_request; + struct ceph_snap_context *snapc = NULL; + u64 snap_id = CEPH_NOSNAP; + struct timespec *mtime = NULL; + struct timespec now; + + rbd_assert(obj_request->osd_req != NULL); + + if (write_request) { + now = CURRENT_TIME; + mtime = &now; + if (img_request) + snapc = img_request->snapc; + } else if (img_request) { + snap_id = img_request->snap_id; + } + +
ceph_osdc_build_request(obj_request->osd_req, obj_request->offset, + 1, op, snapc, snap_id, mtime); +} + static struct ceph_osd_request *rbd_osd_req_create( struct rbd_device *rbd_dev, bool write_request, - struct rbd_obj_request *obj_request, - struct ceph_osd_req_op *op) + struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request = obj_request->img_request; struct ceph_snap_context *snapc = NULL; struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; struct ceph_osd_data *osd_data; - struct timespec now; - struct timespec *mtime; - u64 snap_id = CEPH_NOSNAP; u64 offset = obj_request->offset; - u64 length = obj_request->length; if (img_request) { rbd_assert(img_request->write_request == write_request); if (img_request->write_request) snapc = img_request->snapc; - else - snap_id = img_request->snap_id; } /* Allocate and initialize the request, for the single op */ @@ -1360,16 +1378,10 @@ static struct ceph_osd_request *rbd_osd_req_create( break; } - if (write_request) { + if (write_request) osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; - now = CURRENT_TIME; - mtime = &now; - } else { + else osd_req->r_flags = CEPH_OSD_FLAG_READ; - mtime = NULL; /* not needed for reads */ - offset = 0; /* These are not used... 
*/ - length = 0; /* ...for osd read requests */ - } osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; @@ -1380,11 +1392,6 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_file_layout = rbd_dev->layout; /* struct */ - /* osd_req will get its own reference to snapc (if non-null) */ - - ceph_osdc_build_request(osd_req, offset, 1, op, - snapc, snap_id, mtime); - return osd_req; } @@ -1538,6 +1545,7 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, struct rbd_device *rbd_dev = img_request->rbd_dev; struct rbd_obj_request *obj_request = NULL; struct rbd_obj_request *next_obj_request; + bool write_request = img_request->write_request; unsigned int bio_offset; u64 image_offset; u64 resid; @@ -1545,8 +1553,7 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, dout("%s: img %p bio %p\n", __func__, img_request, bio_list); - opcode = img_request->write_request ? CEPH_OSD_OP_WRITE - : CEPH_OSD_OP_READ; + opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; bio_offset = 0; image_offset = img_request->offset; rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT); @@ -1579,17 +1586,14 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, if (!obj_request->bio_list) goto out_partial; - /* - * Build up the op to use in building the osd - * request. Note that the contents of the op are - * copied by rbd_osd_req_create(). 
- */ - osd_req_op_extent_init(&op, opcode, offset, length, 0, 0); obj_request->osd_req = rbd_osd_req_create(rbd_dev, - img_request->write_request, - obj_request, &op); + write_request, obj_request); if (!obj_request->osd_req) goto out_partial; + + osd_req_op_extent_init(&op, opcode, offset, length, 0, 0); + rbd_osd_req_format_op(obj_request, write_request, &op); + /* status and version are initially zero-filled */ rbd_img_obj_request_add(img_request, obj_request); @@ -1700,12 +1704,13 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, return -ENOMEM; ret = -ENOMEM; - osd_req_op_watch_init(&op, CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver, 0); - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, - obj_request, &op); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); if (!obj_request->osd_req) goto out; + osd_req_op_watch_init(&op, CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver, 0); + rbd_osd_req_format_op(obj_request, false, &op); + osdc = &rbd_dev->rbd_client->client->osdc; obj_request->callback = rbd_obj_request_put; ret = rbd_obj_request_submit(osdc, obj_request); @@ -1764,13 +1769,14 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) if (!obj_request) goto out_cancel; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); + if (!obj_request->osd_req) + goto out_cancel; + osd_req_op_watch_init(&op, CEPH_OSD_OP_WATCH, rbd_dev->watch_event->cookie, rbd_dev->header.obj_version, start); - obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, - obj_request, &op); - if (!obj_request->osd_req) - goto out_cancel; + rbd_osd_req_format_op(obj_request, true, &op); if (start) ceph_osdc_set_request_linger(osdc, obj_request->osd_req); @@ -1856,13 +1862,14 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - osd_req_op_cls_init(&op, CEPH_OSD_OP_CALL, class_name, method_name, - outbound, outbound_size); - obj_request->osd_req = 
rbd_osd_req_create(rbd_dev, false, - obj_request, &op); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); if (!obj_request->osd_req) goto out; + osd_req_op_cls_init(&op, CEPH_OSD_OP_CALL, class_name, method_name, + outbound, outbound_size); + rbd_osd_req_format_op(obj_request, false, &op); + osdc = &rbd_dev->rbd_client->client->osdc; ret = rbd_obj_request_submit(osdc, obj_request); if (ret) @@ -2061,12 +2068,13 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - osd_req_op_extent_init(&op, CEPH_OSD_OP_READ, offset, length, 0, 0); - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, - obj_request, &op); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); if (!obj_request->osd_req) goto out; + osd_req_op_extent_init(&op, CEPH_OSD_OP_READ, offset, length, 0, 0); + rbd_osd_req_format_op(obj_request, false, &op); + osdc = &rbd_dev->rbd_client->client->osdc; ret = rbd_obj_request_submit(osdc, obj_request); if (ret) -- cgit v1.2.3 From 79528734f3ae4699a2886f62f55e18fb34fb3651 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 21:32:51 -0500 Subject: libceph: keep source rather than message osd op array An osd request keeps a pointer to the osd operations (ops) array that it builds in its request message. In order to allow each op in the array to have its own distinct data, we will need to keep track of each op's data, and that information does not go over the wire. As long as we're tracking the data we might as well just track the entire (source) op definition for each of the ops. And if we're doing that, we'll have no more need to keep a pointer to the wire-encoded version. This patch makes the array of source ops be kept with the osd request structure, and uses that instead of the version encoded in the message in places where that was previously used. 
The array will be embedded in the request structure, and the maximum number of ops we ever actually use is currently 2. So reduce CEPH_OSD_MAX_OP to 2 to reduce the size of the structure. The result of doing this sort of ripples back up, and as a result various function parameters and local variables become unnecessary. Make r_num_ops be unsigned, and move the definition of struct ceph_osd_req_op earlier to ensure it's defined where needed. It does not yet add per-op data, that's coming soon. This resolves: http://tracker.ceph.com/issues/4656 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 42 ++++++++++++++----------- fs/ceph/addr.c | 21 ++++++------- fs/ceph/file.c | 6 ++-- include/linux/ceph/osd_client.h | 70 ++++++++++++++++++++--------------------- net/ceph/debugfs.c | 4 +-- net/ceph/osd_client.c | 53 ++++++++++++++++--------------- 6 files changed, 97 insertions(+), 99 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4a4be14a9189..c12b55559f16 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1285,7 +1285,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, */ obj_request->xferred = osd_req->r_reply_op_len[0]; rbd_assert(obj_request->xferred < (u64) UINT_MAX); - opcode = osd_req->r_request_ops[0].op; + opcode = osd_req->r_ops[0].op; switch (opcode) { case CEPH_OSD_OP_READ: rbd_osd_read_callback(obj_request); @@ -1312,8 +1312,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, } static void rbd_osd_req_format_op(struct rbd_obj_request *obj_request, - bool write_request, - struct ceph_osd_req_op *op) + bool write_request) { struct rbd_img_request *img_request = obj_request->img_request; struct ceph_snap_context *snapc = NULL; @@ -1333,7 +1332,7 @@ static void rbd_osd_req_format_op(struct rbd_obj_request *obj_request, } ceph_osdc_build_request(obj_request->osd_req, obj_request->offset, - 1, op, snapc, snap_id, mtime); + snapc, snap_id, 
mtime); } static struct ceph_osd_request *rbd_osd_req_create( @@ -1562,7 +1561,7 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, while (resid) { const char *object_name; unsigned int clone_size; - struct ceph_osd_req_op op; + struct ceph_osd_req_op *op; u64 offset; u64 length; @@ -1591,8 +1590,9 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, if (!obj_request->osd_req) goto out_partial; - osd_req_op_extent_init(&op, opcode, offset, length, 0, 0); - rbd_osd_req_format_op(obj_request, write_request, &op); + op = &obj_request->osd_req->r_ops[0]; + osd_req_op_extent_init(op, opcode, offset, length, 0, 0); + rbd_osd_req_format_op(obj_request, write_request); /* status and version are initially zero-filled */ @@ -1694,7 +1694,7 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 ver, u64 notify_id) { struct rbd_obj_request *obj_request; - struct ceph_osd_req_op op; + struct ceph_osd_req_op *op; struct ceph_osd_client *osdc; int ret; @@ -1708,8 +1708,9 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - osd_req_op_watch_init(&op, CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver, 0); - rbd_osd_req_format_op(obj_request, false, &op); + op = &obj_request->osd_req->r_ops[0]; + osd_req_op_watch_init(op, CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver, 0); + rbd_osd_req_format_op(obj_request, false); osdc = &rbd_dev->rbd_client->client->osdc; obj_request->callback = rbd_obj_request_put; @@ -1749,7 +1750,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; - struct ceph_osd_req_op op; + struct ceph_osd_req_op *op; int ret; rbd_assert(start ^ !!rbd_dev->watch_event); @@ -1773,10 +1774,11 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) if (!obj_request->osd_req) goto out_cancel; - osd_req_op_watch_init(&op, 
CEPH_OSD_OP_WATCH, + op = &obj_request->osd_req->r_ops[0]; + osd_req_op_watch_init(op, CEPH_OSD_OP_WATCH, rbd_dev->watch_event->cookie, rbd_dev->header.obj_version, start); - rbd_osd_req_format_op(obj_request, true, &op); + rbd_osd_req_format_op(obj_request, true); if (start) ceph_osdc_set_request_linger(osdc, obj_request->osd_req); @@ -1836,7 +1838,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, { struct rbd_obj_request *obj_request; struct ceph_osd_client *osdc; - struct ceph_osd_req_op op; + struct ceph_osd_req_op *op; struct page **pages; u32 page_count; int ret; @@ -1866,9 +1868,10 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - osd_req_op_cls_init(&op, CEPH_OSD_OP_CALL, class_name, method_name, + op = &obj_request->osd_req->r_ops[0]; + osd_req_op_cls_init(op, CEPH_OSD_OP_CALL, class_name, method_name, outbound, outbound_size); - rbd_osd_req_format_op(obj_request, false, &op); + rbd_osd_req_format_op(obj_request, false); osdc = &rbd_dev->rbd_client->client->osdc; ret = rbd_obj_request_submit(osdc, obj_request); @@ -2046,8 +2049,8 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, char *buf, u64 *version) { - struct ceph_osd_req_op op; struct rbd_obj_request *obj_request; + struct ceph_osd_req_op *op; struct ceph_osd_client *osdc; struct page **pages = NULL; u32 page_count; @@ -2072,8 +2075,9 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - osd_req_op_extent_init(&op, CEPH_OSD_OP_READ, offset, length, 0, 0); - rbd_osd_req_format_op(obj_request, false, &op); + op = &obj_request->osd_req->r_ops[0]; + osd_req_op_extent_init(op, CEPH_OSD_OP_READ, offset, length, 0, 0); + rbd_osd_req_format_op(obj_request, false); osdc = &rbd_dev->rbd_client->client->osdc; ret = rbd_obj_request_submit(osdc, obj_request); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 127be29a6c22..c9da074f0fe6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -288,7 
+288,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) struct page *page = list_entry(page_list->prev, struct page, lru); struct ceph_vino vino; struct ceph_osd_request *req; - struct ceph_osd_req_op op; u64 off; u64 len; int i; @@ -314,7 +313,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) off, len); vino = ceph_vino(inode); req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, - 1, &op, CEPH_OSD_OP_READ, + 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); @@ -349,7 +348,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) req->r_callback = finish_read; req->r_inode = inode; - ceph_osdc_build_request(req, off, 1, &op, NULL, vino.snap, NULL); + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); @@ -567,7 +566,7 @@ static void writepages_finish(struct ceph_osd_request *req, struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; int rc = req->r_result; - u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length); + u64 bytes = req->r_ops[0].extent.length; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); @@ -635,8 +634,7 @@ static void writepages_finish(struct ceph_osd_request *req, static struct ceph_osd_request * ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len, - struct ceph_snap_context *snapc, - int num_ops, struct ceph_osd_req_op *ops) + struct ceph_snap_context *snapc, int num_ops) { struct ceph_fs_client *fsc; struct ceph_inode_info *ci; @@ -648,7 +646,7 @@ ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len, /* BUG_ON(vino.snap != CEPH_NOSNAP); */ return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, 
offset, len, num_ops, ops, CEPH_OSD_OP_WRITE, + vino, offset, len, num_ops, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK, snapc, ci->i_truncate_seq, ci->i_truncate_size, true); } @@ -738,7 +736,6 @@ retry: last_snapc = snapc; while (!done && index <= end) { - struct ceph_osd_req_op ops[2]; int num_ops = do_sync ? 2 : 1; struct ceph_vino vino; unsigned i; @@ -846,7 +843,7 @@ get_more_pages: len = wsize; req = ceph_writepages_osd_request(inode, offset, &len, snapc, - num_ops, ops); + num_ops); if (IS_ERR(req)) { rc = PTR_ERR(req); @@ -927,11 +924,11 @@ get_more_pages: /* Update the write op length in case we changed it */ - osd_req_op_extent_update(&ops[0], len); + osd_req_op_extent_update(&req->r_ops[0], len); vino = ceph_vino(inode); - ceph_osdc_build_request(req, offset, num_ops, ops, - snapc, vino.snap, &inode->i_mtime); + ceph_osdc_build_request(req, offset, snapc, vino.snap, + &inode->i_mtime); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index da642af14a28..a12f47642c40 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -478,7 +478,6 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; - struct ceph_osd_req_op ops[2]; int num_ops = 1; struct page **pages; int num_pages; @@ -534,7 +533,7 @@ more: snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, num_ops, ops, + vino, pos, &len, num_ops, CEPH_OSD_OP_WRITE, flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, false); @@ -579,8 +578,7 @@ more: false, own_pages); /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, num_ops, ops, - snapc, vino.snap, &mtime); + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { diff 
--git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index af60dac1f9c0..f4c1a2a22a14 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -48,7 +48,7 @@ struct ceph_osd { }; -#define CEPH_OSD_MAX_OP 10 +#define CEPH_OSD_MAX_OP 2 enum ceph_osd_data_type { CEPH_OSD_DATA_TYPE_NONE, @@ -79,6 +79,34 @@ struct ceph_osd_data { }; }; +struct ceph_osd_req_op { + u16 op; /* CEPH_OSD_OP_* */ + u32 payload_len; + union { + struct { + u64 offset, length; + u64 truncate_size; + u32 truncate_seq; + } extent; + struct { + const char *class_name; + const char *method_name; + const void *indata; + u32 indata_len; + __u8 class_len; + __u8 method_len; + __u8 argc; + } cls; + struct { + u64 cookie; + u64 ver; + u32 prot_ver; + u32 timeout; + __u8 flag; + } watch; + }; +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ @@ -95,10 +123,11 @@ struct ceph_osd_request { struct ceph_msg *r_request, *r_reply; int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ - int r_num_ops; - /* encoded message content */ - struct ceph_osd_op *r_request_ops; + /* request osd ops array */ + unsigned int r_num_ops; + struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP]; + /* these are updated on each send */ __le32 *r_request_osdmap_epoch; __le32 *r_request_flags; @@ -193,34 +222,6 @@ struct ceph_osd_client { struct workqueue_struct *notify_wq; }; -struct ceph_osd_req_op { - u16 op; /* CEPH_OSD_OP_* */ - u32 payload_len; - union { - struct { - u64 offset, length; - u64 truncate_size; - u32 truncate_seq; - } extent; - struct { - const char *class_name; - const char *method_name; - const void *indata; - u32 indata_len; - __u8 class_len; - __u8 method_len; - __u8 argc; - } cls; - struct { - u64 cookie; - u64 ver; - u32 prot_ver; - u32 timeout; - __u8 flag; - } watch; - }; -}; - extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); extern 
void ceph_osdc_stop(struct ceph_osd_client *osdc); @@ -249,8 +250,6 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - unsigned int num_ops, - struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, struct timespec *mtime); @@ -259,8 +258,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, u64 offset, u64 *len, - int num_ops, struct ceph_osd_req_op *ops, - int opcode, int flags, + int num_ops, int opcode, int flags, struct ceph_snap_context *snapc, u32 truncate_seq, u64 truncate_size, bool use_mempool); diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 00d051f4894e..83661cdc0766 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -123,8 +123,8 @@ static int osdc_show(struct seq_file *s, void *pp) mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { struct ceph_osd_request *req; + unsigned int i; int opcode; - int i; req = rb_entry(p, struct ceph_osd_request, r_node); @@ -142,7 +142,7 @@ static int osdc_show(struct seq_file *s, void *pp) seq_printf(s, "\t"); for (i = 0; i < req->r_num_ops; i++) { - opcode = le16_to_cpu(req->r_request_ops[i].op); + opcode = req->r_ops[i].op; seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index e197c5c0b3a2..a498d2de17a4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -186,6 +186,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_msg *msg; size_t msg_size; + BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); + BUG_ON(num_ops > CEPH_OSD_MAX_OP); + msg_size = 4 + 4 + 8 + 8 + 4+8; msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pg_t */ @@ -207,6 +210,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct 
ceph_osd_client *osdc, req->r_osdc = osdc; req->r_mempool = use_mempool; + req->r_num_ops = num_ops; kref_init(&req->r_kref); init_completion(&req->r_completion); @@ -418,12 +422,14 @@ void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, EXPORT_SYMBOL(osd_req_op_watch_init); static u64 osd_req_encode_op(struct ceph_osd_request *req, - struct ceph_osd_op *dst, - struct ceph_osd_req_op *src) + struct ceph_osd_op *dst, unsigned int which) { + struct ceph_osd_req_op *src; u64 out_data_len = 0; struct ceph_pagelist *pagelist; + BUG_ON(which >= req->r_num_ops); + src = &req->r_ops[which]; if (WARN_ON(!osd_req_opcode_valid(src->op))) { pr_err("unrecognized osd opcode %d\n", src->op); @@ -487,21 +493,17 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, * build new request AND message * */ -void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, unsigned int num_ops, - struct ceph_osd_req_op *src_ops, - struct ceph_snap_context *snapc, u64 snap_id, - struct timespec *mtime) +void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, + struct ceph_snap_context *snapc, u64 snap_id, + struct timespec *mtime) { struct ceph_msg *msg = req->r_request; - struct ceph_osd_req_op *src_op; void *p; size_t msg_size; int flags = req->r_flags; u64 data_len; - int i; + unsigned int i; - req->r_num_ops = num_ops; req->r_snapid = snap_id; req->r_snapc = ceph_get_snap_context(snapc); @@ -541,12 +543,10 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, p += req->r_oid_len; /* ops--can imply data */ - ceph_encode_16(&p, num_ops); - src_op = src_ops; - req->r_request_ops = p; + ceph_encode_16(&p, (u16)req->r_num_ops); data_len = 0; - for (i = 0; i < num_ops; i++, src_op++) { - data_len += osd_req_encode_op(req, p, src_op); + for (i = 0; i < req->r_num_ops; i++) { + data_len += osd_req_encode_op(req, p, i); p += sizeof(struct ceph_osd_op); } @@ -602,7 +602,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client 
*osdc, struct ceph_file_layout *layout, struct ceph_vino vino, u64 off, u64 *plen, int num_ops, - struct ceph_osd_req_op *ops, int opcode, int flags, struct ceph_snap_context *snapc, u32 truncate_seq, @@ -610,6 +609,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, bool use_mempool) { struct ceph_osd_request *req; + struct ceph_osd_req_op *op; u64 objnum = 0; u64 objoff = 0; u64 objlen = 0; @@ -623,6 +623,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); + req->r_flags = flags; /* calculate max write size */ @@ -642,7 +643,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size = object_size; } - osd_req_op_extent_init(&ops[0], opcode, objoff, objlen, + op = &req->r_ops[0]; + osd_req_op_extent_init(op, opcode, objoff, objlen, truncate_size, truncate_seq); /* * A second op in the ops array means the caller wants to @@ -650,7 +652,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, * osd will flush data quickly. 
*/ if (num_ops > 1) - osd_req_op_init(&ops[1], CEPH_OSD_OP_STARTSYNC); + osd_req_op_init(++op, CEPH_OSD_OP_STARTSYNC); req->r_file_layout = *layout; /* keep a copy */ @@ -1342,7 +1344,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, struct ceph_osd_request *req; u64 tid; int object_len; - int numops, payload_len, flags; + unsigned int numops; + int payload_len, flags; s32 result; s32 retry_attempt; struct ceph_pg pg; @@ -1352,7 +1355,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, u32 osdmap_epoch; int already_completed; u32 bytes; - int i; + unsigned int i; tid = le64_to_cpu(msg->hdr.tid); dout("handle_reply %p tid %llu\n", msg, tid); @@ -2116,12 +2119,11 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct page **pages, int num_pages, int page_align) { struct ceph_osd_request *req; - struct ceph_osd_req_op op; int rc = 0; dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, vino.snap, off, *plen); - req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1, &op, + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, truncate_seq, truncate_size, false); @@ -2136,7 +2138,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", off, *plen, *plen, page_align); - ceph_osdc_build_request(req, off, 1, &op, NULL, vino.snap, NULL); + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); rc = ceph_osdc_start_request(osdc, req, false); if (!rc) @@ -2160,12 +2162,11 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, struct page **pages, int num_pages) { struct ceph_osd_request *req; - struct ceph_osd_req_op op; int rc = 0; int page_align = off & ~PAGE_MASK; BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1, &op, + req = ceph_osdc_new_request(osdc, layout, 
vino, off, &len, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, snapc, truncate_seq, truncate_size, @@ -2178,7 +2179,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, false, false); dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); - ceph_osdc_build_request(req, off, 1, &op, snapc, CEPH_NOSNAP, mtime); + ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); rc = ceph_osdc_start_request(osdc, req, true); if (!rc) -- cgit v1.2.3 From 8c042b0df99cd06ef8473ef6e204b87b3dc80158 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 01:28:58 -0500 Subject: libceph: add data pointers in osd op structures An extent type osd operation currently implies that there will be corresponding data supplied in the data portion of the request (for write) or response (for read) message. Similarly, an osd class method operation implies a data item will be supplied to receive the response data from the operation. Add a ceph_osd_data pointer to each of those structures, and assign it to point to either the incoming or the outgoing data structure in the osd message. The data is not always available when an op is initially set up, so add two new functions to allow setting them after the op has been initialized. Begin to make use of the data item pointer available in the osd operation rather than the request data in or out structure in places where it's convenient. Add some assertions to verify pointers are always set the way they're expected to be. This is a sort of stepping stone toward really moving the data into the osd request ops, to allow for some validation before making that jump. 
This is the first in a series of patches that resolve: http://tracker.ceph.com/issues/4657 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 24 ++++++++++++++++++++---- fs/ceph/addr.c | 8 +++++--- fs/ceph/file.c | 5 +++-- include/linux/ceph/osd_client.h | 6 ++++++ net/ceph/osd_client.c | 26 +++++++++++++++++++++++++- 5 files changed, 59 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c12b55559f16..eb64ed0f228f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1315,23 +1315,39 @@ static void rbd_osd_req_format_op(struct rbd_obj_request *obj_request, bool write_request) { struct rbd_img_request *img_request = obj_request->img_request; + struct ceph_osd_request *osd_req = obj_request->osd_req; + struct ceph_osd_data *osd_data = NULL; struct ceph_snap_context *snapc = NULL; u64 snap_id = CEPH_NOSNAP; struct timespec *mtime = NULL; struct timespec now; - rbd_assert(obj_request->osd_req != NULL); + rbd_assert(osd_req != NULL); if (write_request) { + osd_data = &osd_req->r_data_out; now = CURRENT_TIME; mtime = &now; if (img_request) snapc = img_request->snapc; - } else if (img_request) { - snap_id = img_request->snap_id; + } else { + osd_data = &osd_req->r_data_in; + if (img_request) + snap_id = img_request->snap_id; } + if (obj_request->type != OBJ_REQUEST_NODATA) { + struct ceph_osd_req_op *op = &obj_request->osd_req->r_ops[0]; - ceph_osdc_build_request(obj_request->osd_req, obj_request->offset, + /* + * If it has data, it's either a object class method + * call (cls) or it's an extent operation. 
+ */ + if (op->op == CEPH_OSD_OP_CALL) + osd_req_op_cls_response_data(op, osd_data); + else + osd_req_op_extent_osd_data(op, osd_data); + } + ceph_osdc_build_request(osd_req, obj_request->offset, snapc, snap_id, mtime); } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c9da074f0fe6..0ac3a37753cb 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -343,7 +343,8 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - ceph_osd_data_pages_init(&req->r_data_in, pages, len, 0, + BUG_ON(req->r_ops[0].extent.osd_data != &req->r_data_in); + ceph_osd_data_pages_init(req->r_ops[0].extent.osd_data, pages, len, 0, false, false); req->r_callback = finish_read; req->r_inode = inode; @@ -916,8 +917,9 @@ get_more_pages: dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - ceph_osd_data_pages_init(&req->r_data_out, pages, len, 0, - !!pool, false); + BUG_ON(req->r_ops[0].extent.osd_data != &req->r_data_out); + ceph_osd_data_pages_init(req->r_ops[0].extent.osd_data, pages, + len, 0, !!pool, false); pages = NULL; /* request message now owns the pages array */ pool = NULL; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a12f47642c40..cddc10fd7cf9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -574,8 +574,9 @@ more: own_pages = true; } } - ceph_osd_data_pages_init(&req->r_data_out, pages, len, page_align, - false, own_pages); + BUG_ON(req->r_ops[0].extent.osd_data != &req->r_data_out); + ceph_osd_data_pages_init(req->r_ops[0].extent.osd_data, pages, len, + page_align, false, own_pages); /* BUG_ON(vino.snap != CEPH_NOSNAP); */ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index a9c4089894c8..ae5193550fbf 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -87,12 +87,14 @@ struct ceph_osd_req_op { u64 offset, length; u64 truncate_size; u32 truncate_seq; + struct ceph_osd_data 
*osd_data; } extent; struct { const char *class_name; const char *method_name; const void *request_data; u32 request_data_len; + struct ceph_osd_data *response_data; __u8 class_len; __u8 method_len; __u8 argc; @@ -236,10 +238,14 @@ extern void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq); extern void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length); +extern void osd_req_op_extent_osd_data(struct ceph_osd_req_op *op, + struct ceph_osd_data *osd_data); extern void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size); +extern void osd_req_op_cls_response_data(struct ceph_osd_req_op *op, + struct ceph_osd_data *response_data); extern void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, u64 cookie, u64 version, int flag); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 87fcf0b795c0..23491e92b229 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -372,6 +372,13 @@ void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length) } EXPORT_SYMBOL(osd_req_op_extent_update); +void osd_req_op_extent_osd_data(struct ceph_osd_req_op *op, + struct ceph_osd_data *osd_data) +{ + op->extent.osd_data = osd_data; +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data); + void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size) @@ -406,6 +413,13 @@ void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, } EXPORT_SYMBOL(osd_req_op_cls_init); +void osd_req_op_cls_response_data(struct ceph_osd_req_op *op, + struct ceph_osd_data *response_data) +{ + op->cls.response_data = response_data; +} +EXPORT_SYMBOL(osd_req_op_cls_response_data); + void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, u64 cookie, u64 version, int flag) { @@ -449,6 
+463,10 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); + if (src->op == CEPH_OSD_OP_WRITE) + WARN_ON(src->extent.osd_data != &req->r_data_out); + else + WARN_ON(src->extent.osd_data != &req->r_data_in); break; case CEPH_OSD_OP_CALL: pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); @@ -464,8 +482,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, src->cls.method_len); ceph_pagelist_append(pagelist, src->cls.request_data, src->cls.request_data_len); - ceph_osd_data_pagelist_init(&req->r_data_out, pagelist); + + WARN_ON(src->cls.response_data != &req->r_data_in); request_data_len = pagelist->length; break; case CEPH_OSD_OP_STARTSYNC: @@ -609,6 +628,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, bool use_mempool) { struct ceph_osd_request *req; + struct ceph_osd_data *osd_data; struct ceph_osd_req_op *op; u64 objnum = 0; u64 objoff = 0; @@ -623,6 +643,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); + osd_data = opcode == CEPH_OSD_OP_WRITE ? &req->r_data_out + : &req->r_data_in; req->r_flags = flags; @@ -646,6 +668,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, op = &req->r_ops[0]; osd_req_op_extent_init(op, opcode, objoff, objlen, truncate_size, truncate_seq); + osd_req_op_extent_osd_data(op, osd_data); + /* * A second op in the ops array means the caller wants to * also issue a include a 'startsync' command so that the -- cgit v1.2.3 From c99d2d4abb6c405ef52e9bc1da87b382b8f41739 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:11 -0500 Subject: libceph: specify osd op by index in request An osd request now holds all of its source op structures, and every place that initializes one of these is in fact initializing one of the entries in the the osd request's array. 
So rather than supplying the address of the op to initialize, have caller specify the osd request and an indication of which op it would like to initialize. This better hides the details of the op structure (and facilitates moving the data pointers they use). Since osd_req_op_init() is a common routine, and it's not used outside the osd client code, give it static scope. Also make it return the address of the specified op (so all the other init routines don't have to repeat that code). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 35 ++++++++++------------ fs/ceph/addr.c | 2 +- include/linux/ceph/osd_client.h | 19 +++++++----- net/ceph/osd_client.c | 64 +++++++++++++++++++++++++---------------- 4 files changed, 67 insertions(+), 53 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index eb64ed0f228f..80ac772587c8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1336,16 +1336,17 @@ static void rbd_osd_req_format_op(struct rbd_obj_request *obj_request, snap_id = img_request->snap_id; } if (obj_request->type != OBJ_REQUEST_NODATA) { - struct ceph_osd_req_op *op = &obj_request->osd_req->r_ops[0]; - /* * If it has data, it's either a object class method * call (cls) or it's an extent operation. 
*/ - if (op->op == CEPH_OSD_OP_CALL) - osd_req_op_cls_response_data(op, osd_data); + /* XXX This use of the ops array goes away in the next patch */ + if (obj_request->osd_req->r_ops[0].op == CEPH_OSD_OP_CALL) + osd_req_op_cls_response_data(obj_request->osd_req, 0, + osd_data); else - osd_req_op_extent_osd_data(op, osd_data); + osd_req_op_extent_osd_data(obj_request->osd_req, 0, + osd_data); } ceph_osdc_build_request(osd_req, obj_request->offset, snapc, snap_id, mtime); @@ -1577,7 +1578,6 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, while (resid) { const char *object_name; unsigned int clone_size; - struct ceph_osd_req_op *op; u64 offset; u64 length; @@ -1606,8 +1606,8 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, if (!obj_request->osd_req) goto out_partial; - op = &obj_request->osd_req->r_ops[0]; - osd_req_op_extent_init(op, opcode, offset, length, 0, 0); + osd_req_op_extent_init(obj_request->osd_req, 0, + opcode, offset, length, 0, 0); rbd_osd_req_format_op(obj_request, write_request); /* status and version are initially zero-filled */ @@ -1710,7 +1710,6 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 ver, u64 notify_id) { struct rbd_obj_request *obj_request; - struct ceph_osd_req_op *op; struct ceph_osd_client *osdc; int ret; @@ -1724,8 +1723,8 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - op = &obj_request->osd_req->r_ops[0]; - osd_req_op_watch_init(op, CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver, 0); + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, + notify_id, ver, 0); rbd_osd_req_format_op(obj_request, false); osdc = &rbd_dev->rbd_client->client->osdc; @@ -1766,7 +1765,6 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; - struct ceph_osd_req_op *op; int ret; rbd_assert(start ^ 
!!rbd_dev->watch_event); @@ -1790,8 +1788,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) if (!obj_request->osd_req) goto out_cancel; - op = &obj_request->osd_req->r_ops[0]; - osd_req_op_watch_init(op, CEPH_OSD_OP_WATCH, + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, rbd_dev->watch_event->cookie, rbd_dev->header.obj_version, start); rbd_osd_req_format_op(obj_request, true); @@ -1854,7 +1851,6 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, { struct rbd_obj_request *obj_request; struct ceph_osd_client *osdc; - struct ceph_osd_req_op *op; struct page **pages; u32 page_count; int ret; @@ -1884,8 +1880,8 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - op = &obj_request->osd_req->r_ops[0]; - osd_req_op_cls_init(op, CEPH_OSD_OP_CALL, class_name, method_name, + osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, + class_name, method_name, outbound, outbound_size); rbd_osd_req_format_op(obj_request, false); @@ -2066,7 +2062,6 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, { struct rbd_obj_request *obj_request; - struct ceph_osd_req_op *op; struct ceph_osd_client *osdc; struct page **pages = NULL; u32 page_count; @@ -2091,8 +2086,8 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - op = &obj_request->osd_req->r_ops[0]; - osd_req_op_extent_init(op, CEPH_OSD_OP_READ, offset, length, 0, 0); + osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, + offset, length, 0, 0); rbd_osd_req_format_op(obj_request, false); osdc = &rbd_dev->rbd_client->client->osdc; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 0ac3a37753cb..cc57104a7266 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -926,7 +926,7 @@ get_more_pages: /* Update the write op length in case we changed it */ - osd_req_op_extent_update(&req->r_ops[0], len); + osd_req_op_extent_update(req, 0, len); vino = 
ceph_vino(inode); ceph_osdc_build_request(req, offset, snapc, vino.snap, diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ae5193550fbf..144d57cbef9e 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -233,20 +233,25 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); -extern void osd_req_op_init(struct ceph_osd_req_op *op, u16 opcode); -extern void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, +extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq); -extern void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length); -extern void osd_req_op_extent_osd_data(struct ceph_osd_req_op *op, +extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, + unsigned int which, u64 length); +extern void osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_osd_data *osd_data); -extern void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, +extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size); -extern void osd_req_op_cls_response_data(struct ceph_osd_req_op *op, +extern void osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_osd_data *response_data); -extern void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, +extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, u64 cookie, u64 version, int flag); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 23491e92b229..ad24f210bf0c 
100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -329,25 +329,32 @@ static bool osd_req_opcode_valid(u16 opcode) * other information associated with them. It also serves as a * common init routine for all the other init functions, below. */ -void osd_req_op_init(struct ceph_osd_req_op *op, u16 opcode) +static struct ceph_osd_req_op * +osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode) { + struct ceph_osd_req_op *op; + + BUG_ON(which >= osd_req->r_num_ops); BUG_ON(!osd_req_opcode_valid(opcode)); + op = &osd_req->r_ops[which]; memset(op, 0, sizeof (*op)); - op->op = opcode; + + return op; } -void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, +void osd_req_op_extent_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq) { + struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode); size_t payload_len = 0; BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); - osd_req_op_init(op, opcode); - op->extent.offset = offset; op->extent.length = length; op->extent.truncate_size = truncate_size; @@ -359,9 +366,15 @@ void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, } EXPORT_SYMBOL(osd_req_op_extent_init); -void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length) +void osd_req_op_extent_update(struct ceph_osd_request *osd_req, + unsigned int which, u64 length) { - u64 previous = op->extent.length; + struct ceph_osd_req_op *op; + u64 previous; + + BUG_ON(which >= osd_req->r_num_ops); + op = &osd_req->r_ops[which]; + previous = op->extent.length; if (length == previous) return; /* Nothing to do */ @@ -372,24 +385,25 @@ void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length) } EXPORT_SYMBOL(osd_req_op_extent_update); -void osd_req_op_extent_osd_data(struct ceph_osd_req_op *op, +void osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + unsigned int which, 
struct ceph_osd_data *osd_data) { - op->extent.osd_data = osd_data; + BUG_ON(which >= osd_req->r_num_ops); + osd_req->r_ops[which].extent.osd_data = osd_data; } EXPORT_SYMBOL(osd_req_op_extent_osd_data); -void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, - const char *class, const char *method, +void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size) { + struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode); size_t payload_len = 0; size_t size; BUG_ON(opcode != CEPH_OSD_OP_CALL); - osd_req_op_init(op, opcode); - op->cls.class_name = class; size = strlen(class); BUG_ON(size > (size_t) U8_MAX); @@ -412,26 +426,28 @@ void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, op->payload_len = payload_len; } EXPORT_SYMBOL(osd_req_op_cls_init); - -void osd_req_op_cls_response_data(struct ceph_osd_req_op *op, +void osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_osd_data *response_data) { - op->cls.response_data = response_data; + BUG_ON(which >= osd_req->r_num_ops); + osd_req->r_ops[which].cls.response_data = response_data; } EXPORT_SYMBOL(osd_req_op_cls_response_data); -void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, +void osd_req_op_watch_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, u64 cookie, u64 version, int flag) { - BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); + struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode); - osd_req_op_init(op, opcode); + BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); op->watch.cookie = cookie; /* op->watch.ver = version; */ /* XXX 3847 */ op->watch.ver = cpu_to_le64(version); if (opcode == CEPH_OSD_OP_WATCH && flag) - op->watch.flag = (u8) 1; + op->watch.flag = (u8)1; } EXPORT_SYMBOL(osd_req_op_watch_init); @@ 
-629,7 +645,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, { struct ceph_osd_request *req; struct ceph_osd_data *osd_data; - struct ceph_osd_req_op *op; u64 objnum = 0; u64 objoff = 0; u64 objlen = 0; @@ -665,10 +680,9 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size = object_size; } - op = &req->r_ops[0]; - osd_req_op_extent_init(op, opcode, objoff, objlen, + osd_req_op_extent_init(req, 0, opcode, objoff, objlen, truncate_size, truncate_seq); - osd_req_op_extent_osd_data(op, osd_data); + osd_req_op_extent_osd_data(req, 0, osd_data); /* * A second op in the ops array means the caller wants to @@ -676,7 +690,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, * osd will flush data quickly. */ if (num_ops > 1) - osd_req_op_init(++op, CEPH_OSD_OP_STARTSYNC); + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); req->r_file_layout = *layout; /* keep a copy */ -- cgit v1.2.3 From 2fa123201a86ff979813e24f9e5c5fa54931ab7f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: rbd: don't set data in rbd_osd_req_format_op() Currently an object request has its osd request's data field set in rbd_osd_req_format_op(). That assumes a single osd op per object request, and that won't be the case for long. Move the code that sets this out and into the caller. Rename rbd_osd_req_format_op() to be just rbd_osd_req_format(), removing the notion that it's doing anything op-specific. 
This and the next patch resolve: http://tracker.ceph.com/issues/4658 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 55 ++++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 30 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 80ac772587c8..06912abca601 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1311,12 +1311,11 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, rbd_obj_request_complete(obj_request); } -static void rbd_osd_req_format_op(struct rbd_obj_request *obj_request, +static void rbd_osd_req_format(struct rbd_obj_request *obj_request, bool write_request) { struct rbd_img_request *img_request = obj_request->img_request; struct ceph_osd_request *osd_req = obj_request->osd_req; - struct ceph_osd_data *osd_data = NULL; struct ceph_snap_context *snapc = NULL; u64 snap_id = CEPH_NOSNAP; struct timespec *mtime = NULL; @@ -1325,28 +1324,12 @@ static void rbd_osd_req_format_op(struct rbd_obj_request *obj_request, rbd_assert(osd_req != NULL); if (write_request) { - osd_data = &osd_req->r_data_out; now = CURRENT_TIME; mtime = &now; if (img_request) snapc = img_request->snapc; - } else { - osd_data = &osd_req->r_data_in; - if (img_request) - snap_id = img_request->snap_id; - } - if (obj_request->type != OBJ_REQUEST_NODATA) { - /* - * If it has data, it's either a object class method - * call (cls) or it's an extent operation. 
- */ - /* XXX This use of the ops array goes away in the next patch */ - if (obj_request->osd_req->r_ops[0].op == CEPH_OSD_OP_CALL) - osd_req_op_cls_response_data(obj_request->osd_req, 0, - osd_data); - else - osd_req_op_extent_osd_data(obj_request->osd_req, 0, - osd_data); + } else if (img_request) { + snap_id = img_request->snap_id; } ceph_osdc_build_request(osd_req, obj_request->offset, snapc, snap_id, mtime); @@ -1576,6 +1559,8 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, resid = img_request->length; rbd_assert(resid > 0); while (resid) { + struct ceph_osd_request *osd_req; + struct ceph_osd_data *osd_data; const char *object_name; unsigned int clone_size; u64 offset; @@ -1601,14 +1586,18 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, if (!obj_request->bio_list) goto out_partial; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, - write_request, obj_request); - if (!obj_request->osd_req) + osd_req = rbd_osd_req_create(rbd_dev, write_request, + obj_request); + if (!osd_req) goto out_partial; + obj_request->osd_req = osd_req; - osd_req_op_extent_init(obj_request->osd_req, 0, - opcode, offset, length, 0, 0); - rbd_osd_req_format_op(obj_request, write_request); + osd_data = write_request ? 
&osd_req->r_data_out + : &osd_req->r_data_in; + osd_req_op_extent_init(osd_req, 0, opcode, offset, length, + 0, 0); + osd_req_op_extent_osd_data(osd_req, 0, osd_data); + rbd_osd_req_format(obj_request, write_request); /* status and version are initially zero-filled */ @@ -1725,7 +1714,7 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver, 0); - rbd_osd_req_format_op(obj_request, false); + rbd_osd_req_format(obj_request, false); osdc = &rbd_dev->rbd_client->client->osdc; obj_request->callback = rbd_obj_request_put; @@ -1791,7 +1780,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, rbd_dev->watch_event->cookie, rbd_dev->header.obj_version, start); - rbd_osd_req_format_op(obj_request, true); + rbd_osd_req_format(obj_request, true); if (start) ceph_osdc_set_request_linger(osdc, obj_request->osd_req); @@ -1850,6 +1839,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, u64 *version) { struct rbd_obj_request *obj_request; + struct ceph_osd_data *osd_data; struct ceph_osd_client *osdc; struct page **pages; u32 page_count; @@ -1880,10 +1870,12 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; + osd_data = &obj_request->osd_req->r_data_in; osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, class_name, method_name, outbound, outbound_size); - rbd_osd_req_format_op(obj_request, false); + osd_req_op_cls_response_data(obj_request->osd_req, 0, osd_data); + rbd_osd_req_format(obj_request, false); osdc = &rbd_dev->rbd_client->client->osdc; ret = rbd_obj_request_submit(osdc, obj_request); @@ -2062,6 +2054,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, { struct rbd_obj_request *obj_request; + struct ceph_osd_data *osd_data; struct ceph_osd_client *osdc; struct page **pages = NULL; u32 page_count; @@ -2086,9 
+2079,11 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; + osd_data = &obj_request->osd_req->r_data_in; osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, offset, length, 0, 0); - rbd_osd_req_format_op(obj_request, false); + osd_req_op_extent_osd_data(obj_request->osd_req, 0, osd_data); + rbd_osd_req_format(obj_request, false); osdc = &rbd_dev->rbd_client->client->osdc; ret = rbd_obj_request_submit(osdc, obj_request); -- cgit v1.2.3 From 44cd188d48a95e42651c59ff552d45cc8c667f2c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: rbd: separate initialization of osd data The osd data for a request is currently initialized inside rbd_osd_req_create(), but that assumes an object request's data belongs in the osd request's data in or data out field. There are only three places where requests with data are set up, and it turns out it's easier to call just the osd data init routines directly there rather than handling it in rbd_osd_req_create(). (The real motivation here is moving toward getting rid of the osd request in and out data fields.) 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 06912abca601..4cfe9f96589e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1344,8 +1344,6 @@ static struct ceph_osd_request *rbd_osd_req_create( struct ceph_snap_context *snapc = NULL; struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; - struct ceph_osd_data *osd_data; - u64 offset = obj_request->offset; if (img_request) { rbd_assert(img_request->write_request == write_request); @@ -1359,23 +1357,6 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); if (!osd_req) return NULL; /* ENOMEM */ - osd_data = write_request ? &osd_req->r_data_out : &osd_req->r_data_in; - - rbd_assert(obj_request_type_valid(obj_request->type)); - switch (obj_request->type) { - case OBJ_REQUEST_NODATA: - break; /* Nothing to do */ - case OBJ_REQUEST_BIO: - rbd_assert(obj_request->bio_list != NULL); - ceph_osd_data_bio_init(osd_data, obj_request->bio_list, - obj_request->length); - break; - case OBJ_REQUEST_PAGES: - ceph_osd_data_pages_init(osd_data, obj_request->pages, - obj_request->length, offset & ~PAGE_MASK, - false, false); - break; - } if (write_request) osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; @@ -1596,6 +1577,8 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, : &osd_req->r_data_in; osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 0, 0); + ceph_osd_data_bio_init(osd_data, obj_request->bio_list, + obj_request->length); osd_req_op_extent_osd_data(osd_req, 0, osd_data); rbd_osd_req_format(obj_request, write_request); @@ -1874,6 +1857,8 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, class_name, method_name, outbound, 
outbound_size); + ceph_osd_data_pages_init(osd_data, obj_request->pages, inbound_size, + 0, false, false); osd_req_op_cls_response_data(obj_request->osd_req, 0, osd_data); rbd_osd_req_format(obj_request, false); @@ -2082,6 +2067,10 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, osd_data = &obj_request->osd_req->r_data_in; osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, offset, length, 0, 0); + ceph_osd_data_pages_init(osd_data, obj_request->pages, + obj_request->length, + obj_request->offset & ~PAGE_MASK, + false, false); osd_req_op_extent_osd_data(obj_request->osd_req, 0, osd_data); rbd_osd_req_format(obj_request, false); -- cgit v1.2.3 From 2169238dd3a01bc06670fb9c85635cbe97338ff8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: rbd: rearrange some code for consistency This patch just trivially moves around some code for consistency. In preparation for initializing osd request data fields in ceph_osdc_build_request(), I wanted to verify that rbd did in fact call that immediately before it called ceph_osdc_start_request(). It was true (although image requests are built in a group and then started as a group). But I made the changes here just to make it more obvious, by making all of the calls follow a common sequence: osd_req_op__init(); ceph_osd_data__init() osd_req_op__() rbd_osd_req_format() ... ret = rbd_obj_request_submit() I moved the initialization of the callback for image object requests into rbd_img_request_fill_bio(), again, for consistency. To avoid a forward reference, I moved the definition of rbd_img_obj_callback() up in the file. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 128 +++++++++++++++++++++++++--------------------------- 1 file changed, 62 insertions(+), 66 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4cfe9f96589e..db29783436c8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1519,6 +1519,57 @@ static void rbd_img_request_destroy(struct kref *kref) kfree(img_request); } +static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + u32 which = obj_request->which; + bool more = true; + + img_request = obj_request->img_request; + + dout("%s: img %p obj %p\n", __func__, img_request, obj_request); + rbd_assert(img_request != NULL); + rbd_assert(img_request->rq != NULL); + rbd_assert(img_request->obj_request_count > 0); + rbd_assert(which != BAD_WHICH); + rbd_assert(which < img_request->obj_request_count); + rbd_assert(which >= img_request->next_completion); + + spin_lock_irq(&img_request->completion_lock); + if (which != img_request->next_completion) + goto out; + + for_each_obj_request_from(img_request, obj_request) { + unsigned int xferred; + int result; + + rbd_assert(more); + rbd_assert(which < img_request->obj_request_count); + + if (!obj_request_done_test(obj_request)) + break; + + rbd_assert(obj_request->xferred <= (u64) UINT_MAX); + xferred = (unsigned int) obj_request->xferred; + result = (int) obj_request->result; + if (result) + rbd_warn(NULL, "obj_request %s result %d xferred %u\n", + img_request->write_request ? 
"write" : "read", + result, xferred); + + more = blk_end_request(img_request->rq, result, xferred); + which++; + } + + rbd_assert(more ^ (which == img_request->obj_request_count)); + img_request->next_completion = which; +out: + spin_unlock_irq(&img_request->completion_lock); + + if (!more) + rbd_img_request_complete(img_request); +} + static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, struct bio *bio_list) { @@ -1572,6 +1623,7 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, if (!osd_req) goto out_partial; obj_request->osd_req = osd_req; + obj_request->callback = rbd_img_obj_callback; osd_data = write_request ? &osd_req->r_data_out : &osd_req->r_data_in; @@ -1582,8 +1634,6 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, osd_req_op_extent_osd_data(osd_req, 0, osd_data); rbd_osd_req_format(obj_request, write_request); - /* status and version are initially zero-filled */ - rbd_img_obj_request_add(img_request, obj_request); image_offset += length; @@ -1601,57 +1651,6 @@ out_unwind: return -ENOMEM; } -static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) -{ - struct rbd_img_request *img_request; - u32 which = obj_request->which; - bool more = true; - - img_request = obj_request->img_request; - - dout("%s: img %p obj %p\n", __func__, img_request, obj_request); - rbd_assert(img_request != NULL); - rbd_assert(img_request->rq != NULL); - rbd_assert(img_request->obj_request_count > 0); - rbd_assert(which != BAD_WHICH); - rbd_assert(which < img_request->obj_request_count); - rbd_assert(which >= img_request->next_completion); - - spin_lock_irq(&img_request->completion_lock); - if (which != img_request->next_completion) - goto out; - - for_each_obj_request_from(img_request, obj_request) { - unsigned int xferred; - int result; - - rbd_assert(more); - rbd_assert(which < img_request->obj_request_count); - - if (!obj_request_done_test(obj_request)) - break; - - 
rbd_assert(obj_request->xferred <= (u64) UINT_MAX); - xferred = (unsigned int) obj_request->xferred; - result = (int) obj_request->result; - if (result) - rbd_warn(NULL, "obj_request %s result %d xferred %u\n", - img_request->write_request ? "write" : "read", - result, xferred); - - more = blk_end_request(img_request->rq, result, xferred); - which++; - } - - rbd_assert(more ^ (which == img_request->obj_request_count)); - img_request->next_completion = which; -out: - spin_unlock_irq(&img_request->completion_lock); - - if (!more) - rbd_img_request_complete(img_request); -} - static int rbd_img_request_submit(struct rbd_img_request *img_request) { struct rbd_device *rbd_dev = img_request->rbd_dev; @@ -1663,7 +1662,6 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) for_each_obj_request_safe(img_request, obj_request, next_obj_request) { int ret; - obj_request->callback = rbd_img_obj_callback; ret = rbd_obj_request_submit(osdc, obj_request); if (ret) return ret; @@ -1682,7 +1680,7 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 ver, u64 notify_id) { struct rbd_obj_request *obj_request; - struct ceph_osd_client *osdc; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; int ret; obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, @@ -1694,13 +1692,12 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); if (!obj_request->osd_req) goto out; + obj_request->callback = rbd_obj_request_put; osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver, 0); rbd_osd_req_format(obj_request, false); - osdc = &rbd_dev->rbd_client->client->osdc; - obj_request->callback = rbd_obj_request_put; ret = rbd_obj_request_submit(osdc, obj_request); out: if (ret) @@ -1760,16 +1757,17 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) if (!obj_request->osd_req) goto out_cancel; - 
osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, - rbd_dev->watch_event->cookie, - rbd_dev->header.obj_version, start); - rbd_osd_req_format(obj_request, true); - if (start) ceph_osdc_set_request_linger(osdc, obj_request->osd_req); else ceph_osdc_unregister_linger_request(osdc, rbd_dev->watch_request->osd_req); + + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, + rbd_dev->watch_event->cookie, + rbd_dev->header.obj_version, start); + rbd_osd_req_format(obj_request, true); + ret = rbd_obj_request_submit(osdc, obj_request); if (ret) goto out_cancel; @@ -1821,9 +1819,9 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, size_t inbound_size, u64 *version) { + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; struct ceph_osd_data *osd_data; - struct ceph_osd_client *osdc; struct page **pages; u32 page_count; int ret; @@ -1862,7 +1860,6 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, osd_req_op_cls_response_data(obj_request->osd_req, 0, osd_data); rbd_osd_req_format(obj_request, false); - osdc = &rbd_dev->rbd_client->client->osdc; ret = rbd_obj_request_submit(osdc, obj_request); if (ret) goto out; @@ -2038,9 +2035,9 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, char *buf, u64 *version) { + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; struct ceph_osd_data *osd_data; - struct ceph_osd_client *osdc; struct page **pages = NULL; u32 page_count; size_t size; @@ -2074,7 +2071,6 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, osd_req_op_extent_osd_data(obj_request->osd_req, 0, osd_data); rbd_osd_req_format(obj_request, false); - osdc = &rbd_dev->rbd_client->client->osdc; ret = rbd_obj_request_submit(osdc, obj_request); if (ret) goto out; -- cgit v1.2.3 From a4ce40a9a7c1053ac2a41cf64255e44e356e5522 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: 
libceph: combine initializing and setting osd data This ends up being a rather large patch but what it's doing is somewhat straightforward. Basically, this is replacing two calls with one. The first of the two calls is initializing a struct ceph_osd_data with data (either a page array, a page list, or a bio list); the second is setting an osd request op so it associates that data with one of the op's parameters. In place of those two will be a single function that initializes the op directly. That means we sort of fan out a set of the needed functions: - extent ops with pages data - extent ops with pagelist data - extent ops with bio list data and - class ops with page data for receiving a response We also have to define another one, but it's only used internally: - class ops with pagelist data for request parameters Note that we *still* haven't gotten rid of the osd request's r_data_in and r_data_out fields. All the osd ops refer to them for their data. For now, these data fields are pointers assigned to the appropriate r_data_* field when these new functions are called. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 20 ++---- fs/ceph/addr.c | 12 ++-- fs/ceph/file.c | 3 +- include/linux/ceph/osd_client.h | 43 ++++++----- net/ceph/osd_client.c | 155 +++++++++++++++++++++++++++++++--------- 5 files changed, 161 insertions(+), 72 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index db29783436c8..6f7a52cf75c7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1592,7 +1592,6 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, rbd_assert(resid > 0); while (resid) { struct ceph_osd_request *osd_req; - struct ceph_osd_data *osd_data; const char *object_name; unsigned int clone_size; u64 offset; @@ -1625,13 +1624,10 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, obj_request->osd_req = osd_req; obj_request->callback = rbd_img_obj_callback; - osd_data = write_request ? &osd_req->r_data_out - : &osd_req->r_data_in; osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 0, 0); - ceph_osd_data_bio_init(osd_data, obj_request->bio_list, - obj_request->length); - osd_req_op_extent_osd_data(osd_req, 0, osd_data); + osd_req_op_extent_osd_data_bio(osd_req, 0, write_request, + obj_request->bio_list, obj_request->length); rbd_osd_req_format(obj_request, write_request); rbd_img_obj_request_add(img_request, obj_request); @@ -1821,7 +1817,6 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; - struct ceph_osd_data *osd_data; struct page **pages; u32 page_count; int ret; @@ -1851,13 +1846,12 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - osd_data = &obj_request->osd_req->r_data_in; osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, class_name, method_name, outbound, outbound_size); - ceph_osd_data_pages_init(osd_data, obj_request->pages, 
inbound_size, + osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, + obj_request->pages, inbound_size, 0, false, false); - osd_req_op_cls_response_data(obj_request->osd_req, 0, osd_data); rbd_osd_req_format(obj_request, false); ret = rbd_obj_request_submit(osdc, obj_request); @@ -2037,7 +2031,6 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; - struct ceph_osd_data *osd_data; struct page **pages = NULL; u32 page_count; size_t size; @@ -2061,14 +2054,13 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, if (!obj_request->osd_req) goto out; - osd_data = &obj_request->osd_req->r_data_in; osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, offset, length, 0, 0); - ceph_osd_data_pages_init(osd_data, obj_request->pages, + osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, false, + obj_request->pages, obj_request->length, obj_request->offset & ~PAGE_MASK, false, false); - osd_req_op_extent_osd_data(obj_request->osd_req, 0, osd_data); rbd_osd_req_format(obj_request, false); ret = rbd_obj_request_submit(osdc, obj_request); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index cc57104a7266..27d62070a8e9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -245,7 +245,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ - osd_data = &req->r_data_in; + osd_data = osd_req_op_extent_osd_data(req, 0, false); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); @@ -343,8 +343,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - BUG_ON(req->r_ops[0].extent.osd_data != &req->r_data_in); - ceph_osd_data_pages_init(req->r_ops[0].extent.osd_data, pages, 
len, 0, + osd_req_op_extent_osd_data_pages(req, 0, false, pages, len, 0, false, false); req->r_callback = finish_read; req->r_inode = inode; @@ -572,7 +571,7 @@ static void writepages_finish(struct ceph_osd_request *req, long writeback_stat; unsigned issued = ceph_caps_issued(ci); - osd_data = &req->r_data_out; + osd_data = osd_req_op_extent_osd_data(req, 0, true); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); @@ -917,9 +916,8 @@ get_more_pages: dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - BUG_ON(req->r_ops[0].extent.osd_data != &req->r_data_out); - ceph_osd_data_pages_init(req->r_ops[0].extent.osd_data, pages, - len, 0, !!pool, false); + osd_req_op_extent_osd_data_pages(req, 0, true, pages, len, 0, + !!pool, false); pages = NULL; /* request message now owns the pages array */ pool = NULL; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index cddc10fd7cf9..0f9c4095614b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -574,8 +574,7 @@ more: own_pages = true; } } - BUG_ON(req->r_ops[0].extent.osd_data != &req->r_data_out); - ceph_osd_data_pages_init(req->r_ops[0].extent.osd_data, pages, len, + osd_req_op_extent_osd_data_pages(req, 0, true, pages, len, page_align, false, own_pages); /* BUG_ON(vino.snap != CEPH_NOSNAP); */ diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 71c41575646d..f8a00b48e550 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -240,17 +240,39 @@ extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, u64 truncate_size, u32 truncate_seq); extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, unsigned int which, u64 length); -extern void osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + +extern struct ceph_osd_data *osd_req_op_extent_osd_data( + struct ceph_osd_request *osd_req, + unsigned int which, bool write_request); 
+extern struct ceph_osd_data *osd_req_op_cls_response_data( + struct ceph_osd_request *osd_req, + unsigned int which); + +extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, + unsigned int which, bool write_request, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); +extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *, + unsigned int which, bool write_request, + struct ceph_pagelist *pagelist); +#ifdef CONFIG_BLOCK +extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, + unsigned int which, bool write_request, + struct bio *bio, size_t bio_length); +#endif /* CONFIG_BLOCK */ + +extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, unsigned int which, - struct ceph_osd_data *osd_data); + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); + extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size); -extern void osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, - unsigned int which, - struct ceph_osd_data *response_data); extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 cookie, u64 version, int flag); @@ -290,17 +312,6 @@ static inline void ceph_osdc_put_request(struct ceph_osd_request *req) kref_put(&req->r_kref, ceph_osdc_release_request); } -extern void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, - struct page **pages, u64 length, - u32 alignment, bool pages_from_pool, - bool own_pages); -extern void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, - struct ceph_pagelist *pagelist); -#ifdef CONFIG_BLOCK -extern void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, - struct bio *bio, size_t bio_length); -#endif /* CONFIG_BLOCK */ - extern int ceph_osdc_start_request(struct 
ceph_osd_client *osdc, struct ceph_osd_request *req, bool nofail); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 932b8af8b8ee..86cb52404f17 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1,3 +1,4 @@ + #include #include @@ -85,7 +86,7 @@ static void ceph_osd_data_init(struct ceph_osd_data *osd_data) osd_data->type = CEPH_OSD_DATA_TYPE_NONE; } -void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, +static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) { @@ -96,27 +97,131 @@ void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, osd_data->pages_from_pool = pages_from_pool; osd_data->own_pages = own_pages; } -EXPORT_SYMBOL(ceph_osd_data_pages_init); -void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, +static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, struct ceph_pagelist *pagelist) { osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST; osd_data->pagelist = pagelist; } -EXPORT_SYMBOL(ceph_osd_data_pagelist_init); #ifdef CONFIG_BLOCK -void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, +static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, struct bio *bio, size_t bio_length) { osd_data->type = CEPH_OSD_DATA_TYPE_BIO; osd_data->bio = bio; osd_data->bio_length = bio_length; } -EXPORT_SYMBOL(ceph_osd_data_bio_init); #endif /* CONFIG_BLOCK */ +struct ceph_osd_data * +osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + unsigned int which, bool write_request) +{ + BUG_ON(which >= osd_req->r_num_ops); + + /* return &osd_req->r_ops[which].extent.osd_data; */ + return write_request ? 
&osd_req->r_data_out : &osd_req->r_data_in; +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data); + +struct ceph_osd_data * +osd_req_op_cls_request_info(struct ceph_osd_request *osd_req, + unsigned int which) +{ + BUG_ON(which >= osd_req->r_num_ops); + + /* return &osd_req->r_ops[which].cls.request_info; */ + return &osd_req->r_data_out; /* Request data is outgoing */ +} +EXPORT_SYMBOL(osd_req_op_cls_request_info); /* ??? */ + +struct ceph_osd_data * +osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, + unsigned int which) +{ + BUG_ON(which >= osd_req->r_num_ops); + + /* return &osd_req->r_ops[which].cls.response_data; */ + return &osd_req->r_data_in; /* Response data is incoming */ +} +EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */ + +void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, bool write_request, + struct page **pages, u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); + + osd_req->r_ops[which].extent.osd_data = + osd_req_op_extent_osd_data(osd_req, which, write_request); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); + +void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, + unsigned int which, bool write_request, + struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); + ceph_osd_data_pagelist_init(osd_data, pagelist); + + osd_req->r_ops[which].extent.osd_data = + osd_req_op_extent_osd_data(osd_req, which, write_request); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); + +#ifdef CONFIG_BLOCK +void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, + unsigned int which, bool write_request, + struct bio *bio, size_t bio_length) +{ + struct 
ceph_osd_data *osd_data; + + osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); + ceph_osd_data_bio_init(osd_data, bio, bio_length); + + osd_req->r_ops[which].extent.osd_data = + osd_req_op_extent_osd_data(osd_req, which, write_request); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); +#endif /* CONFIG_BLOCK */ + +static void osd_req_op_cls_request_info_pagelist( + struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_cls_request_info(osd_req, which); + ceph_osd_data_pagelist_init(osd_data, pagelist); + + osd_req->r_ops[which].cls.request_info = + osd_req_op_cls_request_info(osd_req, which); +} + +void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_cls_response_data(osd_req, which); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); + + osd_req->r_ops[which].cls.response_data = + osd_req_op_cls_response_data(osd_req, which); +} +EXPORT_SYMBOL(osd_req_op_cls_response_data_pages); + static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) { switch (osd_data->type) { @@ -385,15 +490,6 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_update); -void osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, - unsigned int which, - struct ceph_osd_data *osd_data) -{ - BUG_ON(which >= osd_req->r_num_ops); - osd_req->r_ops[which].extent.osd_data = osd_data; -} -EXPORT_SYMBOL(osd_req_op_extent_osd_data); - void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size) @@ -429,22 +525,13 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, 
unsigned int which, ceph_pagelist_append(pagelist, request_data, request_data_size); payload_len += request_data_size; - op->cls.request_info = &osd_req->r_data_out; - ceph_osd_data_pagelist_init(op->cls.request_info, pagelist); + osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); op->cls.argc = 0; /* currently unused */ op->payload_len = payload_len; } EXPORT_SYMBOL(osd_req_op_cls_init); -void osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, - unsigned int which, - struct ceph_osd_data *response_data) -{ - BUG_ON(which >= osd_req->r_num_ops); - osd_req->r_ops[which].cls.response_data = response_data; -} -EXPORT_SYMBOL(osd_req_op_cls_response_data); void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, @@ -547,7 +634,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, bool use_mempool) { struct ceph_osd_request *req; - struct ceph_osd_data *osd_data; u64 objnum = 0; u64 objoff = 0; u64 objlen = 0; @@ -561,8 +647,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); - osd_data = opcode == CEPH_OSD_OP_WRITE ? 
&req->r_data_out - : &req->r_data_in; req->r_flags = flags; @@ -585,7 +669,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, osd_req_op_extent_init(req, 0, opcode, objoff, objlen, truncate_size, truncate_seq); - osd_req_op_extent_osd_data(req, 0, osd_data); /* * A second op in the ops array means the caller wants to @@ -2171,8 +2254,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, /* it may be a short read due to an object boundary */ - ceph_osd_data_pages_init(&req->r_data_in, pages, *plen, page_align, - false, false); + osd_req_op_extent_osd_data_pages(req, 0, false, + pages, *plen, page_align, false, false); dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", off, *plen, *plen, page_align); @@ -2214,7 +2297,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, return PTR_ERR(req); /* it may be a short write due to an object boundary */ - ceph_osd_data_pages_init(&req->r_data_out, pages, len, page_align, + osd_req_op_extent_osd_data_pages(req, 0, true, pages, len, page_align, false, false); dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); @@ -2308,8 +2391,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - struct ceph_osd_data *osd_data = &req->r_data_in; + struct ceph_osd_data *osd_data; + /* + * XXX This is assuming there is only one op containing + * XXX page data. Probably OK for reads, but this + * XXX ought to be done more generally. 
+ */ + osd_data = osd_req_op_extent_osd_data(req, 0, false); if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { if (osd_data->pages && unlikely(osd_data->length < data_len)) { -- cgit v1.2.3 From 04017e29bbcf0673d8a6af616c56e395d05f5971 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 14:46:02 -0500 Subject: libceph: make method call data be a separate data item Right now the data for a method call is specified via a pointer and length, and it's copied--along with the class and method name--into a pagelist data item to be sent to the osd. Instead, encode the data in a data item separate from the class and method names. This will allow large amounts of data to be supplied to methods without copying. Only rbd uses the class functionality right now, and when it really needs this it will probably need to use a page array rather than a page list. But this simple implementation demonstrates the functionality on the osd client, and that's enough for now. This resolves: http://tracker.ceph.com/issues/4104 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 15 ++++++++-- include/linux/ceph/osd_client.h | 10 +++---- net/ceph/osd_client.c | 62 +++++++++++++++++++++++++++++------------ 3 files changed, 62 insertions(+), 25 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6f7a52cf75c7..11b7987cb75f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1847,8 +1847,19 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, goto out; osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, - class_name, method_name, - outbound, outbound_size); + class_name, method_name); + if (outbound_size) { + struct ceph_pagelist *pagelist; + + pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); + if (!pagelist) + goto out; + + ceph_pagelist_init(pagelist); + ceph_pagelist_append(pagelist, outbound, outbound_size); + osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, + 
pagelist); + } osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, obj_request->pages, inbound_size, 0, false, false); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4ec46c0ceaf7..2a68a7465c18 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -92,10 +92,9 @@ struct ceph_osd_req_op { struct { const char *class_name; const char *method_name; - const void *request_data; struct ceph_osd_data request_info; + struct ceph_osd_data request_data; struct ceph_osd_data response_data; - u32 request_data_len; __u8 class_len; __u8 method_len; __u8 argc; @@ -259,6 +258,9 @@ extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, struct bio *bio, size_t bio_length); #endif /* CONFIG_BLOCK */ +extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, + unsigned int which, + struct ceph_pagelist *pagelist); extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, unsigned int which, struct page **pages, u64 length, @@ -267,9 +269,7 @@ extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, - const char *class, const char *method, - const void *request_data, - size_t request_data_size); + const char *class, const char *method); extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 cookie, u64 version, int flag); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 73227853d845..939be67199ca 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -135,6 +135,16 @@ osd_req_op_cls_request_info(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_cls_request_info); /* ??? 
*/ +struct ceph_osd_data * +osd_req_op_cls_request_data(struct ceph_osd_request *osd_req, + unsigned int which) +{ + BUG_ON(which >= osd_req->r_num_ops); + + return &osd_req->r_ops[which].cls.request_data; +} +EXPORT_SYMBOL(osd_req_op_cls_request_data); /* ??? */ + struct ceph_osd_data * osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, unsigned int which) @@ -192,6 +202,17 @@ static void osd_req_op_cls_request_info_pagelist( ceph_osd_data_pagelist_init(osd_data, pagelist); } +void osd_req_op_cls_request_data_pagelist( + struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_cls_request_data(osd_req, which); + ceph_osd_data_pagelist_init(osd_data, pagelist); +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); + void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) @@ -251,6 +272,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, break; case CEPH_OSD_OP_CALL: ceph_osd_data_release(&op->cls.request_info); + ceph_osd_data_release(&op->cls.request_data); ceph_osd_data_release(&op->cls.response_data); break; default: @@ -492,8 +514,7 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, EXPORT_SYMBOL(osd_req_op_extent_update); void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, - u16 opcode, const char *class, const char *method, - const void *request_data, size_t request_data_size) + u16 opcode, const char *class, const char *method) { struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, opcode); struct ceph_pagelist *pagelist; @@ -520,12 +541,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, ceph_pagelist_append(pagelist, method, size); payload_len += size; - op->cls.request_data = request_data; - BUG_ON(request_data_size > 
(size_t) U32_MAX); - op->cls.request_data_len = (u32) request_data_size; - ceph_pagelist_append(pagelist, request_data, request_data_size); - payload_len += request_data_size; - osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); op->cls.argc = 0; /* currently unused */ @@ -576,7 +591,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, struct ceph_osd_op *dst, unsigned int which) { struct ceph_osd_req_op *src; + struct ceph_osd_data *osd_data; u64 request_data_len = 0; + u64 data_length; BUG_ON(which >= req->r_num_ops); src = &req->r_ops[which]; @@ -599,22 +616,31 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); + osd_data = &src->extent.osd_data; if (src->op == CEPH_OSD_OP_WRITE) - ceph_osdc_msg_data_add(req->r_request, - &src->extent.osd_data); + ceph_osdc_msg_data_add(req->r_request, osd_data); else - ceph_osdc_msg_data_add(req->r_reply, - &src->extent.osd_data); + ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_CALL: dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; - dst->cls.indata_len = cpu_to_le32(src->cls.request_data_len); - ceph_osdc_msg_data_add(req->r_reply, &src->cls.response_data); - ceph_osdc_msg_data_add(req->r_request, &src->cls.request_info); - BUG_ON(src->cls.request_info.type != - CEPH_OSD_DATA_TYPE_PAGELIST); - request_data_len = src->cls.request_info.pagelist->length; + osd_data = &src->cls.request_info; + ceph_osdc_msg_data_add(req->r_request, osd_data); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST); + request_data_len = osd_data->pagelist->length; + + osd_data = &src->cls.request_data; + data_length = ceph_osd_data_length(osd_data); + if (data_length) { + BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); + dst->cls.indata_len = cpu_to_le32(data_length); + ceph_osdc_msg_data_add(req->r_request, osd_data); + src->payload_len += 
data_length; + request_data_len += data_length; + } + osd_data = &src->cls.response_data; + ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_STARTSYNC: break; -- cgit v1.2.3 From 5cbf6f12c48121199cc214c93dea98cce719343b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 11 Apr 2013 09:29:48 -0500 Subject: rbd: update feature bits There is a new rbd feature bit defined for "fancy striping." Add it to the ones defined in the kernel client. Change RBD_FEATURES_ALL so it represents the set of all feature bits (rather than just the ones we support). Define a new symbol RBD_FEATURES_SUPPORTED to indicate the supported ones. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 11b7987cb75f..503e64f51fe1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -73,11 +73,14 @@ /* Feature bits */ -#define RBD_FEATURE_LAYERING 1 +#define RBD_FEATURE_LAYERING (1<<0) +#define RBD_FEATURE_STRIPINGV2 (1<<1) +#define RBD_FEATURES_ALL \ + (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) /* Features supported by this (client software) implementation. */ -#define RBD_FEATURES_ALL (0) +#define RBD_FEATURES_SUPPORTED (0) /* * An RBD device name will be "rbd#", where the "rbd" comes from @@ -2843,7 +2846,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, return ret; incompat = le64_to_cpu(features_buf.incompat); - if (incompat & ~RBD_FEATURES_ALL) + if (incompat & ~RBD_FEATURES_SUPPORTED) return -ENXIO; *snap_features = le64_to_cpu(features_buf.features); -- cgit v1.2.3 From a5a337d4382dfe0f9e9e072e7d3eaad8e05e4b0b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 24 Jan 2013 16:13:36 -0600 Subject: rbd: record overall image request result If any image object request produces a non-zero result, preserve that as the result of the overall image request. 
If multiple objects have non-zero results, save only the first one. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 503e64f51fe1..69eab66b6c67 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -214,6 +214,7 @@ struct rbd_img_request { spinlock_t completion_lock;/* protects next_completion */ u32 next_completion; rbd_img_callback_t callback; + int result; /* first nonzero obj_request result */ u32 obj_request_count; struct list_head obj_requests; /* rbd_obj_request structs */ @@ -1488,6 +1489,7 @@ static struct rbd_img_request *rbd_img_request_create( spin_lock_init(&img_request->completion_lock); img_request->next_completion = 0; img_request->callback = NULL; + img_request->result = 0; img_request->obj_request_count = 0; INIT_LIST_HEAD(&img_request->obj_requests); kref_init(&img_request->kref); @@ -1552,13 +1554,16 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) if (!obj_request_done_test(obj_request)) break; - rbd_assert(obj_request->xferred <= (u64) UINT_MAX); - xferred = (unsigned int) obj_request->xferred; - result = (int) obj_request->result; - if (result) + rbd_assert(obj_request->xferred <= (u64)UINT_MAX); + xferred = (unsigned int)obj_request->xferred; + result = obj_request->result; + if (result) { rbd_warn(NULL, "obj_request %s result %d xferred %u\n", img_request->write_request ? 
"write" : "read", result, xferred); + if (!img_request->result) + img_request->result = result; + } more = blk_end_request(img_request->rq, result, xferred); which++; -- cgit v1.2.3 From 55f27e09312310d4dea9bb7b80c696f407caf1be Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 10 Apr 2013 12:34:25 -0500 Subject: rbd: record aggregate image transfer count Compute the total number of bytes transferred for an image request--the sum across each of the request's object requests. To avoid contention do it only when all object requests are complete, in rbd_img_request_complete(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 69eab66b6c67..e8374aec93da 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -214,6 +214,7 @@ struct rbd_img_request { spinlock_t completion_lock;/* protects next_completion */ u32 next_completion; rbd_img_callback_t callback; + u64 xferred;/* aggregate bytes transferred */ int result; /* first nonzero obj_request result */ u32 obj_request_count; @@ -1148,7 +1149,24 @@ static int rbd_obj_request_submit(struct ceph_osd_client *osdc, static void rbd_img_request_complete(struct rbd_img_request *img_request) { + dout("%s: img %p\n", __func__, img_request); + + /* + * If no error occurred, compute the aggregate transfer + * count for the image request. We could instead use + * atomic64_cmpxchg() to update it as each object request + * completes; not clear which way is better off hand. 
+ */ + if (!img_request->result) { + struct rbd_obj_request *obj_request; + u64 xferred = 0; + + for_each_obj_request(img_request, obj_request) + xferred += obj_request->xferred; + img_request->xferred = xferred; + } + if (img_request->callback) img_request->callback(img_request); else -- cgit v1.2.3 From 7da22d296d871174f3e8251a02a8f86a90c7463b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 24 Jan 2013 16:13:36 -0600 Subject: rbd: record image-relative offset in object requests For an image object request we will need to know what offset within the rbd image the request covers. Record that when the object request gets created. Update the I/O error warnings so they use this so what's reported is more informative. Rename a local variable to fit the convention used everywhere else. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e8374aec93da..f0124c5fbe4b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -176,6 +176,7 @@ struct rbd_obj_request { u64 length; /* bytes from offset */ struct rbd_img_request *img_request; + u64 img_offset; /* image relative offset */ struct list_head links; /* img_request->obj_requests */ u32 which; /* posn image request list */ @@ -1576,8 +1577,13 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) xferred = (unsigned int)obj_request->xferred; result = obj_request->result; if (result) { - rbd_warn(NULL, "obj_request %s result %d xferred %u\n", + struct rbd_device *rbd_dev = img_request->rbd_dev; + + rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", img_request->write_request ? 
"write" : "read", + obj_request->length, obj_request->img_offset, + obj_request->offset); + rbd_warn(rbd_dev, " result %d xferred %x\n", result, xferred); if (!img_request->result) img_request->result = result; @@ -1604,7 +1610,7 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, struct rbd_obj_request *next_obj_request; bool write_request = img_request->write_request; unsigned int bio_offset; - u64 image_offset; + u64 img_offset; u64 resid; u16 opcode; @@ -1612,8 +1618,8 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; bio_offset = 0; - image_offset = img_request->offset; - rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT); + img_offset = img_request->offset; + rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); resid = img_request->length; rbd_assert(resid > 0); while (resid) { @@ -1623,11 +1629,11 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, u64 offset; u64 length; - object_name = rbd_segment_name(rbd_dev, image_offset); + object_name = rbd_segment_name(rbd_dev, img_offset); if (!object_name) goto out_unwind; - offset = rbd_segment_offset(rbd_dev, image_offset); - length = rbd_segment_length(rbd_dev, image_offset, resid); + offset = rbd_segment_offset(rbd_dev, img_offset); + length = rbd_segment_length(rbd_dev, img_offset, resid); obj_request = rbd_obj_request_create(object_name, offset, length, OBJ_REQUEST_BIO); @@ -1656,9 +1662,10 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, obj_request->bio_list, obj_request->length); rbd_osd_req_format(obj_request, write_request); + obj_request->img_offset = img_offset; rbd_img_obj_request_add(img_request, obj_request); - image_offset += length; + img_offset += length; resid -= length; } @@ -1993,8 +2000,10 @@ static void rbd_request_fn(struct request_queue *q) end_request: spin_lock_irq(q->queue_lock); if (result < 0) { - 
rbd_warn(rbd_dev, "obj_request %s result %d\n", - write_request ? "write" : "read", result); + rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", + write_request ? "write" : "read", + length, offset, result); + __blk_end_request_all(rq, result); } } -- cgit v1.2.3 From 0c425248e0c6b3ebb64489b178b5412ab164b7f8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Feb 2013 09:55:49 -0600 Subject: rbd: define image request flags There are several Boolean values we'll be maintaining for image requests. Switch from the single write_request field to a general-purpose flags field, and use one of its bits to represent the direction of I/O for the image request. Define helper functions for setting and testing that flag. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f0124c5fbe4b..5ea2e36926a8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -202,12 +202,16 @@ struct rbd_obj_request { struct kref kref; }; +enum img_req_flags { + IMG_REQ_WRITE, /* read = 0, write = 1 */ +}; + struct rbd_img_request { struct request *rq; struct rbd_device *rbd_dev; u64 offset; /* starting image byte offset */ u64 length; /* byte count from offset */ - bool write_request; /* false for read */ + unsigned long flags; union { struct ceph_snap_context *snapc; /* for writes */ u64 snap_id; /* for reads */ @@ -1210,6 +1214,23 @@ static bool obj_request_done_test(struct rbd_obj_request *obj_request) return atomic_read(&obj_request->done) != 0; } +/* + * The default/initial value for all image request flags is 0. Each + * is conditionally set to 1 at image request initialization time + * and currently never change thereafter.
+ */ +static void img_request_write_set(struct rbd_img_request *img_request) +{ + set_bit(IMG_REQ_WRITE, &img_request->flags); + smp_mb(); +} + +static bool img_request_write_test(struct rbd_img_request *img_request) +{ + smp_mb(); + return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; +} + static void rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) { @@ -1369,8 +1390,9 @@ static struct ceph_osd_request *rbd_osd_req_create( struct ceph_osd_request *osd_req; if (img_request) { - rbd_assert(img_request->write_request == write_request); - if (img_request->write_request) + rbd_assert(write_request == + img_request_write_test(img_request)); + if (write_request) snapc = img_request->snapc; } @@ -1494,17 +1516,20 @@ static struct rbd_img_request *rbd_img_request_create( kfree(img_request); return NULL; /* Shouldn't happen */ } + } img_request->rq = NULL; img_request->rbd_dev = rbd_dev; img_request->offset = offset; img_request->length = length; - img_request->write_request = write_request; - if (write_request) + img_request->flags = 0; + if (write_request) { + img_request_write_set(img_request); img_request->snapc = snapc; - else + } else { img_request->snap_id = rbd_dev->spec->snap_id; + } spin_lock_init(&img_request->completion_lock); img_request->next_completion = 0; img_request->callback = NULL; @@ -1537,7 +1562,7 @@ static void rbd_img_request_destroy(struct kref *kref) rbd_img_obj_request_del(img_request, obj_request); rbd_assert(img_request->obj_request_count == 0); - if (img_request->write_request) + if (img_request_write_test(img_request)) ceph_put_snap_context(img_request->snapc); kfree(img_request); @@ -1580,7 +1605,8 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) struct rbd_device *rbd_dev = img_request->rbd_dev; rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", - img_request->write_request ? "write" : "read", + img_request_write_test(img_request) ? 
"write" + : "read", obj_request->length, obj_request->img_offset, obj_request->offset); rbd_warn(rbd_dev, " result %d xferred %x\n", @@ -1608,7 +1634,7 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, struct rbd_device *rbd_dev = img_request->rbd_dev; struct rbd_obj_request *obj_request = NULL; struct rbd_obj_request *next_obj_request; - bool write_request = img_request->write_request; + bool write_request = img_request_write_test(img_request); unsigned int bio_offset; u64 img_offset; u64 resid; -- cgit v1.2.3 From 9849e986367ef95bac92609bba0349669ed87b53 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 24 Jan 2013 16:13:36 -0600 Subject: rbd: define image request originator flag Define a flag indicating whether an image request originated from the Linux block layer (from blk_fetch_request()) or whether it was initiated in order to satisfy an object request for a child image of a layered rbd device. For image requests initiated by objects of child images we'll save a pointer to the object request rather than the Linux block request. For now, only block requests are used. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5ea2e36926a8..7ecd9099ea89 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -203,18 +203,22 @@ struct rbd_obj_request { }; enum img_req_flags { - IMG_REQ_WRITE, /* read = 0, write = 1 */ + IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ + IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ }; struct rbd_img_request { - struct request *rq; struct rbd_device *rbd_dev; u64 offset; /* starting image byte offset */ u64 length; /* byte count from offset */ unsigned long flags; union { + u64 snap_id; /* for reads */ struct ceph_snap_context *snapc; /* for writes */ - u64 snap_id; /* for reads */ + }; + union { + struct request *rq; /* block request */ + struct rbd_obj_request *obj_request; /* obj req initiator */ }; spinlock_t completion_lock;/* protects next_completion */ u32 next_completion; @@ -1231,6 +1235,18 @@ static bool img_request_write_test(struct rbd_img_request *img_request) return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; } +static void img_request_child_set(struct rbd_img_request *img_request) +{ + set_bit(IMG_REQ_CHILD, &img_request->flags); + smp_mb(); +} + +static bool img_request_child_test(struct rbd_img_request *img_request) +{ + smp_mb(); + return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; +} + static void rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) { @@ -1499,7 +1515,8 @@ static void rbd_obj_request_destroy(struct kref *kref) static struct rbd_img_request *rbd_img_request_create( struct rbd_device *rbd_dev, u64 offset, u64 length, - bool write_request) + bool write_request, + bool child_request) { struct rbd_img_request *img_request; struct ceph_snap_context *snapc = NULL; @@ -1530,6 +1547,8 @@ static struct rbd_img_request 
*rbd_img_request_create( } else { img_request->snap_id = rbd_dev->spec->snap_id; } + if (child_request) + img_request_child_set(img_request); spin_lock_init(&img_request->completion_lock); img_request->next_completion = 0; img_request->callback = NULL; @@ -1578,7 +1597,9 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) dout("%s: img %p obj %p\n", __func__, img_request, obj_request); rbd_assert(img_request != NULL); + rbd_assert(!img_request_child_test(img_request)) rbd_assert(img_request->rq != NULL); + rbd_assert(img_request->obj_request_count > 0); rbd_assert(which != BAD_WHICH); rbd_assert(which < img_request->obj_request_count); @@ -2012,7 +2033,7 @@ static void rbd_request_fn(struct request_queue *q) result = -ENOMEM; img_request = rbd_img_request_create(rbd_dev, offset, length, - write_request); + write_request, false); if (!img_request) goto end_request; -- cgit v1.2.3 From d0b2e944555d1f06cf6df8a37b76367d10b05b01 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 24 Jan 2013 16:13:36 -0600 Subject: rbd: define image request layered flag Define a flag indicating whether an image request is for a layered image (one with a parent image to which requests will be redirected if the target object of a request does not exist). The code that checks this flag will be added shortly. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7ecd9099ea89..a77157d87915 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -205,6 +205,7 @@ struct rbd_obj_request { enum img_req_flags { IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ + IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ }; struct rbd_img_request { @@ -1247,6 +1248,18 @@ static bool img_request_child_test(struct rbd_img_request *img_request) return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; } +static void img_request_layered_set(struct rbd_img_request *img_request) +{ + set_bit(IMG_REQ_LAYERED, &img_request->flags); + smp_mb(); +} + +static bool img_request_layered_test(struct rbd_img_request *img_request) +{ + smp_mb(); + return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; +} + static void rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) { @@ -1549,6 +1562,8 @@ static struct rbd_img_request *rbd_img_request_create( } if (child_request) img_request_child_set(img_request); + if (rbd_dev->parent_spec) + img_request_layered_set(img_request); spin_lock_init(&img_request->completion_lock); img_request->next_completion = 0; img_request->callback = NULL; @@ -1557,6 +1572,7 @@ static struct rbd_img_request *rbd_img_request_create( INIT_LIST_HEAD(&img_request->obj_requests); kref_init(&img_request->kref); + (void) img_request_layered_test(img_request); /* Avoid a warning */ rbd_img_request_get(img_request); /* Avoid a warning */ rbd_img_request_put(img_request); /* TEMPORARY */ -- cgit v1.2.3 From 1217857fbf0fe6245aa0ce775480a759a0bbadeb Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Feb 2013 09:55:49 -0600 Subject: rbd: encapsulate image object end request handling Encapsulate the code that completes 
processing of an object request that's part of an image request. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 54 ++++++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 25 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a77157d87915..2d2711537537 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1603,6 +1603,34 @@ static void rbd_img_request_destroy(struct kref *kref) kfree(img_request); } +static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request = obj_request->img_request; + unsigned int xferred; + int result; + + rbd_assert(!img_request_child_test(img_request)); + rbd_assert(img_request->rq != NULL); + + rbd_assert(obj_request->xferred <= (u64)UINT_MAX); + xferred = (unsigned int)obj_request->xferred; + result = obj_request->result; + if (result) { + struct rbd_device *rbd_dev = img_request->rbd_dev; + + rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", + img_request_write_test(img_request) ? 
"write" : "read", + obj_request->length, obj_request->img_offset, + obj_request->offset); + rbd_warn(rbd_dev, " result %d xferred %x\n", + result, xferred); + if (!img_request->result) + img_request->result = result; + } + + return blk_end_request(img_request->rq, result, xferred); +} + static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; @@ -1613,9 +1641,6 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) dout("%s: img %p obj %p\n", __func__, img_request, obj_request); rbd_assert(img_request != NULL); - rbd_assert(!img_request_child_test(img_request)) - rbd_assert(img_request->rq != NULL); - rbd_assert(img_request->obj_request_count > 0); rbd_assert(which != BAD_WHICH); rbd_assert(which < img_request->obj_request_count); @@ -1626,33 +1651,12 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) goto out; for_each_obj_request_from(img_request, obj_request) { - unsigned int xferred; - int result; - rbd_assert(more); rbd_assert(which < img_request->obj_request_count); if (!obj_request_done_test(obj_request)) break; - - rbd_assert(obj_request->xferred <= (u64)UINT_MAX); - xferred = (unsigned int)obj_request->xferred; - result = obj_request->result; - if (result) { - struct rbd_device *rbd_dev = img_request->rbd_dev; - - rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", - img_request_write_test(img_request) ? 
"write" - : "read", - obj_request->length, obj_request->img_offset, - obj_request->offset); - rbd_warn(rbd_dev, " result %d xferred %x\n", - result, xferred); - if (!img_request->result) - img_request->result = result; - } - - more = blk_end_request(img_request->rq, result, xferred); + more = rbd_img_obj_end_request(obj_request); which++; } -- cgit v1.2.3 From 926f9b3f085cec8be0cbf4dcc66c28b5ac49cc14 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Feb 2013 12:33:24 -0600 Subject: rbd: define an rbd object request flags field We're going to need some more Boolean values for object requests, so create a flags bit field and use it to record whether the request is done. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 58 ++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2d2711537537..f7046e976bb0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -170,10 +170,15 @@ enum obj_request_type { OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES }; +enum obj_req_flags { + OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ +}; + struct rbd_obj_request { const char *object_name; u64 offset; /* object start byte */ u64 length; /* bytes from offset */ + unsigned long flags; struct rbd_img_request *img_request; u64 img_offset; /* image relative offset */ @@ -194,7 +199,6 @@ struct rbd_obj_request { u64 xferred; /* bytes transferred */ u64 version; int result; - atomic_t done; rbd_obj_callback_t callback; struct completion completion; @@ -1072,6 +1076,29 @@ out_err: return NULL; } +/* + * The default/initial value for all object request flags is 0. For + * each flag, once its value is set to 1 it is never reset to 0 + * again. 
+ */ +static void obj_request_done_set(struct rbd_obj_request *obj_request) +{ + if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { + struct rbd_img_request *img_request = obj_request->img_request; + struct rbd_device *rbd_dev; + + rbd_dev = img_request ? img_request->rbd_dev : NULL; + rbd_warn(rbd_dev, "obj_request %p already marked done\n", + obj_request); + } +} + +static bool obj_request_done_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; +} + static void rbd_obj_request_get(struct rbd_obj_request *obj_request) { dout("%s: obj %p (was %d)\n", __func__, obj_request, @@ -1192,33 +1219,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) return wait_for_completion_interruptible(&obj_request->completion); } -static void obj_request_done_init(struct rbd_obj_request *obj_request) -{ - atomic_set(&obj_request->done, 0); - smp_wmb(); -} - -static void obj_request_done_set(struct rbd_obj_request *obj_request) -{ - int done; - - done = atomic_inc_return(&obj_request->done); - if (done > 1) { - struct rbd_img_request *img_request = obj_request->img_request; - struct rbd_device *rbd_dev; - - rbd_dev = img_request ? img_request->rbd_dev : NULL; - rbd_warn(rbd_dev, "obj_request %p was already done\n", - obj_request); - } -} - -static bool obj_request_done_test(struct rbd_obj_request *obj_request) -{ - smp_mb(); - return atomic_read(&obj_request->done) != 0; -} - /* * The default/initial value for all image request flags is 0. 
Each * is conditionally set to 1 at image request initialization time @@ -1475,10 +1475,10 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, obj_request->object_name = memcpy(name, object_name, size); obj_request->offset = offset; obj_request->length = length; + obj_request->flags = 0; obj_request->which = BAD_WHICH; obj_request->type = type; INIT_LIST_HEAD(&obj_request->links); - obj_request_done_init(obj_request); init_completion(&obj_request->completion); kref_init(&obj_request->kref); -- cgit v1.2.3 From 6365d33a275b392d3b224808490cd6172123969e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Feb 2013 12:33:24 -0600 Subject: rbd: add an object request flag for image data objects Add a flag to distinguish between object requests being done on standalone objects and requests being sent for objects representing rbd image data (i.e., object requests that are the result of an image request). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f7046e976bb0..3f162e216194 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -172,6 +172,7 @@ enum obj_request_type { enum obj_req_flags { OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ + OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ }; struct rbd_obj_request { @@ -1099,6 +1100,24 @@ static bool obj_request_done_test(struct rbd_obj_request *obj_request) return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; } +static void obj_request_img_data_set(struct rbd_obj_request *obj_request) +{ + if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { + struct rbd_img_request *img_request = obj_request->img_request; + struct rbd_device *rbd_dev; + + rbd_dev = img_request ?
img_request->rbd_dev : NULL; + rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", + obj_request); + } +} + +static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; +} + static void rbd_obj_request_get(struct rbd_obj_request *obj_request) { dout("%s: obj %p (was %d)\n", __func__, obj_request, @@ -1139,6 +1158,8 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, rbd_obj_request_get(obj_request); obj_request->img_request = img_request; obj_request->which = img_request->obj_request_count; + rbd_assert(!obj_request_img_data_test(obj_request)); + obj_request_img_data_set(obj_request); rbd_assert(obj_request->which != BAD_WHICH); img_request->obj_request_count++; list_add_tail(&obj_request->links, &img_request->obj_requests); @@ -1158,6 +1179,7 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, img_request->obj_request_count--; rbd_assert(obj_request->which == img_request->obj_request_count); obj_request->which = BAD_WHICH; + rbd_assert(obj_request_img_data_test(obj_request)); rbd_assert(obj_request->img_request == img_request); obj_request->img_request = NULL; obj_request->callback = NULL; @@ -1343,7 +1365,9 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); rbd_assert(osd_req == obj_request->osd_req); - rbd_assert(!!obj_request->img_request ^ + rbd_assert(obj_request_img_data_test(obj_request) ^ + !obj_request->img_request); + rbd_assert(obj_request_img_data_test(obj_request) ^ (obj_request->which == BAD_WHICH)); if (osd_req->r_result < 0) @@ -1413,12 +1437,13 @@ static struct ceph_osd_request *rbd_osd_req_create( bool write_request, struct rbd_obj_request *obj_request) { - struct rbd_img_request *img_request = obj_request->img_request; struct ceph_snap_context *snapc = NULL; struct ceph_osd_client *osdc; struct 
ceph_osd_request *osd_req; - if (img_request) { + if (obj_request_img_data_test(obj_request)) { + struct rbd_img_request *img_request = obj_request->img_request; + rbd_assert(write_request == img_request_write_test(img_request)); if (write_request) @@ -1605,10 +1630,13 @@ static void rbd_img_request_destroy(struct kref *kref) static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) { - struct rbd_img_request *img_request = obj_request->img_request; + struct rbd_img_request *img_request; unsigned int xferred; int result; + rbd_assert(obj_request_img_data_test(obj_request)); + img_request = obj_request->img_request; + rbd_assert(!img_request_child_test(img_request)); rbd_assert(img_request->rq != NULL); @@ -1637,6 +1665,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) u32 which = obj_request->which; bool more = true; + rbd_assert(obj_request_img_data_test(obj_request)); img_request = obj_request->img_request; dout("%s: img %p obj %p\n", __func__, img_request, obj_request); -- cgit v1.2.3 From 2f82ee54d95c9430838e4580f3bcc196ad36e4f2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Oct 2012 19:40:33 -0500 Subject: rbd: probe the parent of an image if present Call the probe function for the parent device if one is present. Since we don't formally support the layering feature we won't be using this functionality just yet. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3f162e216194..5c129c54279c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -289,6 +289,7 @@ struct rbd_device { struct rbd_spec *parent_spec; u64 parent_overlap; + struct rbd_device *parent; /* protects updating the header */ struct rw_semaphore header_rwsem; @@ -335,6 +336,7 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); +static int rbd_dev_probe(struct rbd_device *rbd_dev); static struct bus_attribute rbd_bus_attrs[] = { __ATTR(add, S_IWUSR, NULL, rbd_add), @@ -497,6 +499,13 @@ out_opt: return ERR_PTR(ret); } +static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) +{ + kref_get(&rbdc->kref); + + return rbdc; +} + /* * Find a ceph client with specific addr and configuration. If * found, bump its reference count. @@ -512,7 +521,8 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) spin_lock(&rbd_client_list_lock); list_for_each_entry(client_node, &rbd_client_list, node) { if (!ceph_compare_options(ceph_opts, client_node->client)) { - kref_get(&client_node->kref); + __rbd_get_client(client_node); + found = true; break; } @@ -2741,8 +2751,6 @@ static struct rbd_spec *rbd_spec_alloc(void) return NULL; kref_init(&spec->kref); - rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */ - return spec; } @@ -3837,6 +3845,11 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) void *response; void *p; + /* If we already have it we don't need to look it up */ + + if (rbd_dev->spec->image_id) + return 0; + /* * When probing a parent image, the image id is already * known (and the image name likely is not). 
There's no @@ -4014,6 +4027,9 @@ out_err: static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) { + struct rbd_device *parent = NULL; + struct rbd_spec *parent_spec = NULL; + struct rbd_client *rbdc = NULL; int ret; /* no need to lock here, as rbd_dev is not registered yet */ @@ -4058,6 +4074,31 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) * At this point cleanup in the event of an error is the job * of the sysfs code (initiated by rbd_bus_del_dev()). */ + /* Probe the parent if there is one */ + + if (rbd_dev->parent_spec) { + /* + * We need to pass a reference to the client and the + * parent spec when creating the parent rbd_dev. + * Images related by parent/child relationships + * always share both. + */ + parent_spec = rbd_spec_get(rbd_dev->parent_spec); + rbdc = __rbd_get_client(rbd_dev->rbd_client); + + parent = rbd_dev_create(rbdc, parent_spec); + if (!parent) { + ret = -ENOMEM; + goto err_out_spec; + } + rbdc = NULL; /* parent now owns reference */ + parent_spec = NULL; /* parent now owns reference */ + ret = rbd_dev_probe(parent); + if (ret < 0) + goto err_out_parent; + rbd_dev->parent = parent; + } + down_write(&rbd_dev->header_rwsem); ret = rbd_dev_snaps_register(rbd_dev); up_write(&rbd_dev->header_rwsem); @@ -4076,6 +4117,12 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) (unsigned long long) rbd_dev->mapping.size); return ret; + +err_out_parent: + rbd_dev_destroy(parent); +err_out_spec: + rbd_spec_put(parent_spec); + rbd_put_client(rbdc); err_out_bus: /* this will also clean up rest of rbd_dev stuff */ @@ -4239,6 +4286,12 @@ static void rbd_dev_release(struct device *dev) module_put(THIS_MODULE); } +static void __rbd_remove(struct rbd_device *rbd_dev) +{ + rbd_remove_all_snaps(rbd_dev); + rbd_bus_del_dev(rbd_dev); +} + static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count) @@ -4274,8 +4327,26 @@ static ssize_t rbd_remove(struct bus_type *bus, if (ret < 0) goto done; - 
rbd_remove_all_snaps(rbd_dev); - rbd_bus_del_dev(rbd_dev); + while (rbd_dev->parent_spec) { + struct rbd_device *first = rbd_dev; + struct rbd_device *second = first->parent; + struct rbd_device *third; + + /* + * Follow to the parent with no grandparent and + * remove it. + */ + while (second && (third = second->parent)) { + first = second; + second = third; + } + __rbd_remove(second); + rbd_spec_put(first->parent_spec); + first->parent_spec = NULL; + first->parent_overlap = 0; + first->parent = NULL; + } + __rbd_remove(rbd_dev); done: mutex_unlock(&ctl_mutex); -- cgit v1.2.3 From 8b3e1a56982d0eafff0afb0ff9e87c8b944a9bdc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 24 Jan 2013 16:13:36 -0600 Subject: rbd: implement layered reads Implement layered read requests for format 2 rbd images. If an rbd image is a clone of a snapshot, the snapshot will be the clone's "parent" image. When an object read request on a clone comes back with ENOENT it indicates that the clone is not yet populated with that portion of the image's data, and the parent image should be consulted to satisfy the read. When this occurs, a new image request is created, directed to the parent image. The offset and length of the image are the same as the image-relative offset and length of the object request that produced ENOENT. Data from the parent image therefore satisfies the object read request for the original image request. While this code works, it will not be active until we enable the layering feature (by adding RBD_FEATURE_LAYERING to the value of RBD_FEATURES_SUPPORTED). 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 85 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5c129c54279c..13a381b2a779 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -398,6 +398,8 @@ void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) # define rbd_assert(expr) ((void) 0) #endif /* !RBD_DEBUG */ +static void rbd_img_parent_read(struct rbd_obj_request *obj_request); + static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); @@ -1336,9 +1338,15 @@ static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) { - dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, - obj_request->result, obj_request->xferred, obj_request->length); - if (obj_request->img_request) + struct rbd_img_request *img_request = obj_request->img_request; + bool layered = img_request && img_request_layered_test(img_request); + + dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, + obj_request, img_request, obj_request->result, + obj_request->xferred, obj_request->length); + if (layered && obj_request->result == -ENOENT) + rbd_img_parent_read(obj_request); + else if (img_request) rbd_img_obj_request_read_callback(obj_request); else obj_request_done_set(obj_request); @@ -1349,9 +1357,8 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) dout("%s: obj %p result %d %llu\n", __func__, obj_request, obj_request->result, obj_request->length); /* - * There is no such thing as a successful short write. - * Our xferred value is the number of bytes transferred - * back. Set it to our originally-requested length. + * There is no such thing as a successful short write. 
Set + * it to our originally-requested length. */ obj_request->xferred = obj_request->length; obj_request_done_set(obj_request); @@ -1391,7 +1398,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, * passed to blk_end_request(), which takes an unsigned int. */ obj_request->xferred = osd_req->r_reply_op_len[0]; - rbd_assert(obj_request->xferred < (u64) UINT_MAX); + rbd_assert(obj_request->xferred < (u64)UINT_MAX); opcode = osd_req->r_ops[0].op; switch (opcode) { case CEPH_OSD_OP_READ: @@ -1607,7 +1614,6 @@ static struct rbd_img_request *rbd_img_request_create( INIT_LIST_HEAD(&img_request->obj_requests); kref_init(&img_request->kref); - (void) img_request_layered_test(img_request); /* Avoid a warning */ rbd_img_request_get(img_request); /* Avoid a warning */ rbd_img_request_put(img_request); /* TEMPORARY */ @@ -1635,6 +1641,9 @@ static void rbd_img_request_destroy(struct kref *kref) if (img_request_write_test(img_request)) ceph_put_snap_context(img_request->snapc); + if (img_request_child_test(img_request)) + rbd_obj_request_put(img_request->obj_request); + kfree(img_request); } @@ -1643,13 +1652,11 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) struct rbd_img_request *img_request; unsigned int xferred; int result; + bool more; rbd_assert(obj_request_img_data_test(obj_request)); img_request = obj_request->img_request; - rbd_assert(!img_request_child_test(img_request)); - rbd_assert(img_request->rq != NULL); - rbd_assert(obj_request->xferred <= (u64)UINT_MAX); xferred = (unsigned int)obj_request->xferred; result = obj_request->result; @@ -1666,7 +1673,15 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) img_request->result = result; } - return blk_end_request(img_request->rq, result, xferred); + if (img_request_child_test(img_request)) { + rbd_assert(img_request->obj_request != NULL); + more = obj_request->which < img_request->obj_request_count - 1; + } else { + rbd_assert(img_request->rq != 
NULL); + more = blk_end_request(img_request->rq, result, xferred); + } + + return more; } static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) @@ -1811,6 +1826,64 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) return 0; } +static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) +{ + struct rbd_obj_request *obj_request; + + rbd_assert(img_request_child_test(img_request)); + + obj_request = img_request->obj_request; + rbd_assert(obj_request != NULL); + obj_request->result = img_request->result; + obj_request->xferred = img_request->xferred; + + rbd_img_obj_request_read_callback(obj_request); + rbd_obj_request_complete(obj_request); +} + +static void rbd_img_parent_read(struct rbd_obj_request *obj_request) +{ + struct rbd_device *rbd_dev; + struct rbd_img_request *img_request; + int result; + + rbd_assert(obj_request_img_data_test(obj_request)); + rbd_assert(obj_request->img_request != NULL); + rbd_assert(obj_request->result == (s32) -ENOENT); + rbd_assert(obj_request->type == OBJ_REQUEST_BIO); + + rbd_dev = obj_request->img_request->rbd_dev; + rbd_assert(rbd_dev->parent != NULL); + /* rbd_read_finish(obj_request, obj_request->length); */ + img_request = rbd_img_request_create(rbd_dev->parent, + obj_request->img_offset, + obj_request->length, + false, true); + result = -ENOMEM; + if (!img_request) + goto out_err; + + rbd_obj_request_get(obj_request); + img_request->obj_request = obj_request; + + result = rbd_img_request_fill_bio(img_request, obj_request->bio_list); + if (result) + goto out_err; + + img_request->callback = rbd_img_parent_read_callback; + result = rbd_img_request_submit(img_request); + if (result) + goto out_err; + + return; +out_err: + if (img_request) + rbd_img_request_put(img_request); + obj_request->result = result; + obj_request->xferred = 0; + obj_request_done_set(obj_request); +} + static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 ver, u64 notify_id) { -- cgit v1.2.3 
From 406e2c9f9286fc93ae2191a7abf477dea05aadc9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 15 Apr 2013 14:50:36 -0500 Subject: libceph: kill off osd data write_request parameters In the incremental move toward supporting distinct data items in an osd request some of the functions had "write_request" parameters to indicate, basically, whether the data belonged to in_data or the out_data. Now that we maintain the data fields in the op structure there is no need to indicate the direction, so get rid of the "write_request" parameters. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 ++-- fs/ceph/addr.c | 9 ++++----- fs/ceph/file.c | 4 ++-- include/linux/ceph/osd_client.h | 8 ++++---- net/ceph/osd_client.c | 25 +++++++++++-------------- 5 files changed, 23 insertions(+), 27 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 13a381b2a779..8e8b876e83c3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1779,7 +1779,7 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 0, 0); - osd_req_op_extent_osd_data_bio(osd_req, 0, write_request, + osd_req_op_extent_osd_data_bio(osd_req, 0, obj_request->bio_list, obj_request->length); rbd_osd_req_format(obj_request, write_request); @@ -2281,7 +2281,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, offset, length, 0, 0); - osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, false, + osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, obj_request->pages, obj_request->length, obj_request->offset & ~PAGE_MASK, diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 2d6466b5fe82..3e68ac101040 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -245,7 +245,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) dout("finish_read %p req %p rc %d bytes %d\n", 
inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ - osd_data = osd_req_op_extent_osd_data(req, 0, false); + osd_data = osd_req_op_extent_osd_data(req, 0); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); @@ -343,8 +343,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - osd_req_op_extent_osd_data_pages(req, 0, false, pages, len, 0, - false, false); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); req->r_callback = finish_read; req->r_inode = inode; @@ -571,7 +570,7 @@ static void writepages_finish(struct ceph_osd_request *req, long writeback_stat; unsigned issued = ceph_caps_issued(ci); - osd_data = osd_req_op_extent_osd_data(req, 0, true); + osd_data = osd_req_op_extent_osd_data(req, 0); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); @@ -916,7 +915,7 @@ get_more_pages: dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - osd_req_op_extent_osd_data_pages(req, 0, true, pages, len, 0, + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, !!pool, false); pages = NULL; /* request message now owns the pages array */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7e94dcb66d92..d70830c66833 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -585,8 +585,8 @@ more: own_pages = true; } } - osd_req_op_extent_osd_data_pages(req, 0, true, pages, len, - page_align, false, own_pages); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, own_pages); /* BUG_ON(vino.snap != CEPH_NOSNAP); */ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 0d3358ef5285..0e406934a551 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -241,22 +241,22 
@@ extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, extern struct ceph_osd_data *osd_req_op_extent_osd_data( struct ceph_osd_request *osd_req, - unsigned int which, bool write_request); + unsigned int which); extern struct ceph_osd_data *osd_req_op_cls_response_data( struct ceph_osd_request *osd_req, unsigned int which); extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, - unsigned int which, bool write_request, + unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages); extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *, - unsigned int which, bool write_request, + unsigned int which, struct ceph_pagelist *pagelist); #ifdef CONFIG_BLOCK extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, - unsigned int which, bool write_request, + unsigned int which, struct bio *bio, size_t bio_length); #endif /* CONFIG_BLOCK */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0c5bf2fb5075..409c443c8d1f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -117,7 +117,7 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, struct ceph_osd_data * osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, - unsigned int which, bool write_request) + unsigned int which) { BUG_ON(which >= osd_req->r_num_ops); @@ -156,37 +156,34 @@ osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? 
*/ void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, - unsigned int which, bool write_request, - struct page **pages, u64 length, u32 alignment, + unsigned int which, struct page **pages, + u64 length, u32 alignment, bool pages_from_pool, bool own_pages) { struct ceph_osd_data *osd_data; - osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); + osd_data = osd_req_op_extent_osd_data(osd_req, which); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, - unsigned int which, bool write_request, - struct ceph_pagelist *pagelist) + unsigned int which, struct ceph_pagelist *pagelist) { struct ceph_osd_data *osd_data; - osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); + osd_data = osd_req_op_extent_osd_data(osd_req, which); ceph_osd_data_pagelist_init(osd_data, pagelist); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); #ifdef CONFIG_BLOCK void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, - unsigned int which, bool write_request, - struct bio *bio, size_t bio_length) + unsigned int which, struct bio *bio, size_t bio_length) { struct ceph_osd_data *osd_data; - - osd_data = osd_req_op_extent_osd_data(osd_req, which, write_request); + osd_data = osd_req_op_extent_osd_data(osd_req, which); ceph_osd_data_bio_init(osd_data, bio, bio_length); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); @@ -2284,7 +2281,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, /* it may be a short read due to an object boundary */ - osd_req_op_extent_osd_data_pages(req, 0, false, + osd_req_op_extent_osd_data_pages(req, 0, pages, *plen, page_align, false, false); dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", @@ -2327,7 +2324,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, return PTR_ERR(req); 
/* it may be a short write due to an object boundary */ - osd_req_op_extent_osd_data_pages(req, 0, true, pages, len, page_align, + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, false, false); dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); @@ -2428,7 +2425,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, * XXX page data. Probably OK for reads, but this * XXX ought to be done more generally. */ - osd_data = osd_req_op_extent_osd_data(req, 0, false); + osd_data = osd_req_op_extent_osd_data(req, 0); if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { if (osd_data->pages && unlikely(osd_data->length < data_len)) { -- cgit v1.2.3 From b155e86cf619886388d80ec298b0f13694c83595 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 15 Apr 2013 14:50:37 -0500 Subject: rbd: adjust image object request ref counting An extra reference is taken when an object request is added as one of the requests making up an image object. A reference is dropped again when the image's object requests get submitted. The original reference for the object request will remain throughout this period, so we don't need to add and then take away an extra one. This can be interpreted as the image request inheriting the original object request's reference. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8e8b876e83c3..81751cd8361e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1167,7 +1167,7 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, { rbd_assert(obj_request->img_request == NULL); - rbd_obj_request_get(obj_request); + /* Image request now owns object's original reference */ obj_request->img_request = img_request; obj_request->which = img_request->obj_request_count; rbd_assert(!obj_request_img_data_test(obj_request)); @@ -1815,12 +1815,6 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) ret = rbd_obj_request_submit(osdc, obj_request); if (ret) return ret; - /* - * The image request has its own reference to each - * of its object requests, so we can safely drop the - * initial one here. - */ - rbd_obj_request_put(obj_request); } return 0; -- cgit v1.2.3 From 57acbaa7fb00b6e1a74d29aaaaf273ed8cb4dabc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Feb 2013 12:33:24 -0600 Subject: rbd: always check IMG_DATA flag In a few spots, whether an object request's img_request pointer is null is used to determine whether an object request is being done as part of an image data request. Stop doing that, and instead always use the object request IMG_DATA flag for that purpose. Swap the order of the definition of the IMG_DATA and DONE flag helpers, because obj_request_done_set() now refers to obj_request_img_data_test() to get its rbd_dev value. This will become important because the img_request pointer is about to become part of a union. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 81751cd8361e..211baa7f4f0b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1094,40 +1094,39 @@ out_err: * each flag, once its value is set to 1 it is never reset to 0 * again. */ -static void obj_request_done_set(struct rbd_obj_request *obj_request) +static void obj_request_img_data_set(struct rbd_obj_request *obj_request) { - if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { - struct rbd_img_request *img_request = obj_request->img_request; + if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { struct rbd_device *rbd_dev; - rbd_dev = img_request ? img_request->rbd_dev : NULL; - rbd_warn(rbd_dev, "obj_request %p already marked done\n", + rbd_dev = obj_request->img_request->rbd_dev; + rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", obj_request); } } -static bool obj_request_done_test(struct rbd_obj_request *obj_request) +static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) { smp_mb(); - return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; + return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; } -static void obj_request_img_data_set(struct rbd_obj_request *obj_request) +static void obj_request_done_set(struct rbd_obj_request *obj_request) { - if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { - struct rbd_img_request *img_request = obj_request->img_request; - struct rbd_device *rbd_dev; + if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { + struct rbd_device *rbd_dev = NULL; - rbd_dev = img_request ? 
img_request->rbd_dev : NULL; - rbd_warn(rbd_dev, "obj_request %p already marked img_data\n", + if (obj_request_img_data_test(obj_request)) + rbd_dev = obj_request->img_request->rbd_dev; + rbd_warn(rbd_dev, "obj_request %p already marked done\n", obj_request); } } -static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) +static bool obj_request_done_test(struct rbd_obj_request *obj_request) { smp_mb(); - return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; + return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; } static void rbd_obj_request_get(struct rbd_obj_request *obj_request) @@ -1338,8 +1337,16 @@ static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) { - struct rbd_img_request *img_request = obj_request->img_request; - bool layered = img_request && img_request_layered_test(img_request); + struct rbd_img_request *img_request = NULL; + bool layered = false; + + if (obj_request_img_data_test(obj_request)) { + img_request = obj_request->img_request; + layered = img_request && img_request_layered_test(img_request); + } else { + img_request = NULL; + layered = false; + } dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, obj_request, img_request, obj_request->result, @@ -1382,10 +1389,12 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); rbd_assert(osd_req == obj_request->osd_req); - rbd_assert(obj_request_img_data_test(obj_request) ^ - !obj_request->img_request); - rbd_assert(obj_request_img_data_test(obj_request) ^ - (obj_request->which == BAD_WHICH)); + if (obj_request_img_data_test(obj_request)) { + rbd_assert(obj_request->img_request); + rbd_assert(obj_request->which != BAD_WHICH); + } else { + rbd_assert(obj_request->which == BAD_WHICH); + } if (osd_req->r_result < 0) obj_request->result = osd_req->r_result; -- cgit v1.2.3 From 
5679c59f608f2fedff313e59b374257f1c945234 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Feb 2013 12:33:24 -0600 Subject: rbd: add target object existence flags This creates two new flags for object requests to indicate what is known about the existence of the object to which a request is to be sent. The KNOWN flag will be true if the EXISTS flag is meaningful. That is: KNOWN EXISTS ----- ------ 0 0 don't know whether the object exists 0 1 (not used/invalid) 1 0 object is known to not exist 1 1 object is known to exist This will be used in determining how to handle write requests for data objects for layered rbd images. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 211baa7f4f0b..b1b8ef864d58 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -173,6 +173,8 @@ enum obj_request_type { enum obj_req_flags { OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ + OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ + OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ }; struct rbd_obj_request { @@ -1129,6 +1131,37 @@ static bool obj_request_done_test(struct rbd_obj_request *obj_request) return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; } +/* + * This sets the KNOWN flag after (possibly) setting the EXISTS + * flag. The latter is set based on the "exists" value provided. + * + * Note that for our purposes once an object exists it never goes + * away again. It's possible that the response from two existence + * checks are separated by the creation of the target object, and + * the first ("doesn't exist") response arrives *after* the second + * ("does exist"). In that case we ignore the second one. 
+ */ +static void obj_request_existence_set(struct rbd_obj_request *obj_request, + bool exists) +{ + if (exists) + set_bit(OBJ_REQ_EXISTS, &obj_request->flags); + set_bit(OBJ_REQ_KNOWN, &obj_request->flags); + smp_mb(); +} + +static bool obj_request_known_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; +} + +static bool obj_request_exists_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; +} + static void rbd_obj_request_get(struct rbd_obj_request *obj_request) { dout("%s: obj %p (was %d)\n", __func__, obj_request, @@ -1623,6 +1656,10 @@ static struct rbd_img_request *rbd_img_request_create( INIT_LIST_HEAD(&img_request->obj_requests); kref_init(&img_request->kref); + (void) obj_request_existence_set; + (void) obj_request_known_test; + (void) obj_request_exists_test; + rbd_img_request_get(img_request); /* Avoid a warning */ rbd_img_request_put(img_request); /* TEMPORARY */ -- cgit v1.2.3 From c5b5ef6c51124e61829632251098f8b5efecae8a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Feb 2013 12:33:24 -0600 Subject: rbd: issue stat request before layered write This is a step toward fully implementing layered writes. Add checks before request submission for the object(s) associated with an image request. For write requests, if we don't know that the target object exists, issue a STAT request to find out. When that request completes, mark the known and exists flags for the original object request accordingly and re-submit the object request. (Note that this still does the existence check only; the copyup operation is not yet done.) A new object request is created to perform the existence check. A pointer to the original request is added to that object request to allow the stat request to re-issue the original request after updating its flags. 
If there is a failure with the stat request the error code is stored with the original request, which is then completed. This resolves: http://tracker.ceph.com/issues/3418 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 163 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 155 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b1b8ef864d58..449847badcd8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -183,9 +183,31 @@ struct rbd_obj_request { u64 length; /* bytes from offset */ unsigned long flags; - struct rbd_img_request *img_request; - u64 img_offset; /* image relative offset */ - struct list_head links; /* img_request->obj_requests */ + /* + * An object request associated with an image will have its + * img_data flag set; a standalone object request will not. + * + * A standalone object request will have which == BAD_WHICH + * and a null obj_request pointer. + * + * An object request initiated in support of a layered image + * object (to check for its existence before a write) will + * have which == BAD_WHICH and a non-null obj_request pointer. + * + * Finally, an object request for rbd image data will have + * which != BAD_WHICH, and will have a non-null img_request + * pointer. The value of which will be in the range + * 0..(img_request->obj_request_count-1). 
+ */ + union { + struct rbd_obj_request *obj_request; /* STAT op */ + struct { + struct rbd_img_request *img_request; + u64 img_offset; + /* links for img_request->obj_requests list */ + struct list_head links; + }; + }; u32 which; /* posn image request list */ enum obj_request_type type; @@ -1656,10 +1678,6 @@ static struct rbd_img_request *rbd_img_request_create( INIT_LIST_HEAD(&img_request->obj_requests); kref_init(&img_request->kref); - (void) obj_request_existence_set; - (void) obj_request_known_test; - (void) obj_request_exists_test; - rbd_img_request_get(img_request); /* Avoid a warning */ rbd_img_request_put(img_request); /* TEMPORARY */ @@ -1847,18 +1865,147 @@ out_unwind: return -ENOMEM; } +static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) +{ + struct rbd_device *rbd_dev; + struct ceph_osd_client *osdc; + struct rbd_obj_request *orig_request; + int result; + + rbd_assert(!obj_request_img_data_test(obj_request)); + + /* + * All we need from the object request is the original + * request and the result of the STAT op. Grab those, then + * we're done with the request. + */ + orig_request = obj_request->obj_request; + obj_request->obj_request = NULL; + rbd_assert(orig_request); + rbd_assert(orig_request->img_request); + + result = obj_request->result; + obj_request->result = 0; + + dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, + obj_request, orig_request, result, + obj_request->xferred, obj_request->length); + rbd_obj_request_put(obj_request); + + rbd_assert(orig_request); + rbd_assert(orig_request->img_request); + rbd_dev = orig_request->img_request->rbd_dev; + osdc = &rbd_dev->rbd_client->client->osdc; + + /* + * Our only purpose here is to determine whether the object + * exists, and we don't want to treat the non-existence as + * an error. If something else comes back, transfer the + * error to the original request and complete it now. 
+ */ + if (!result) { + obj_request_existence_set(orig_request, true); + } else if (result == -ENOENT) { + obj_request_existence_set(orig_request, false); + } else if (result) { + orig_request->result = result; + goto out_err; + } + + /* + * Resubmit the original request now that we have recorded + * whether the target object exists. + */ + orig_request->result = rbd_obj_request_submit(osdc, orig_request); +out_err: + if (orig_request->result) + rbd_obj_request_complete(orig_request); + rbd_obj_request_put(orig_request); +} + +static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) +{ + struct rbd_obj_request *stat_request; + struct rbd_device *rbd_dev; + struct ceph_osd_client *osdc; + struct page **pages = NULL; + u32 page_count; + size_t size; + int ret; + + /* + * The response data for a STAT call consists of: + * le64 length; + * struct { + * le32 tv_sec; + * le32 tv_nsec; + * } mtime; + */ + size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); + page_count = (u32)calc_pages_for(0, size); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = -ENOMEM; + stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, + OBJ_REQUEST_PAGES); + if (!stat_request) + goto out; + + rbd_obj_request_get(obj_request); + stat_request->obj_request = obj_request; + stat_request->pages = pages; + stat_request->page_count = page_count; + + rbd_assert(obj_request->img_request); + rbd_dev = obj_request->img_request->rbd_dev; + stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, + stat_request); + if (!stat_request->osd_req) + goto out; + stat_request->callback = rbd_img_obj_exists_callback; + + osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); + osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, + false, false); + rbd_osd_req_format(stat_request, false); + + osdc = &rbd_dev->rbd_client->client->osdc; + ret = rbd_obj_request_submit(osdc, stat_request); 
+out: + if (ret) + rbd_obj_request_put(obj_request); + + return ret; +} + static int rbd_img_request_submit(struct rbd_img_request *img_request) { struct rbd_device *rbd_dev = img_request->rbd_dev; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; struct rbd_obj_request *next_obj_request; + bool write_request = img_request_write_test(img_request); + bool layered = img_request_layered_test(img_request); dout("%s: img %p\n", __func__, img_request); for_each_obj_request_safe(img_request, obj_request, next_obj_request) { + bool known; + bool object_exists; int ret; - ret = rbd_obj_request_submit(osdc, obj_request); + /* + * We need to know whether the target object exists + * for a layered write. Issue an existence check + * first if we need to. + */ + known = obj_request_known_test(obj_request); + object_exists = known && obj_request_exists_test(obj_request); + if (!write_request || !layered || object_exists) + ret = rbd_obj_request_submit(osdc, obj_request); + else + ret = rbd_img_obj_exists_submit(obj_request); if (ret) return ret; } -- cgit v1.2.3 From 9d4df01f08e2f2a777f3476741ff4ef8afb04be6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 19 Apr 2013 15:34:50 -0500 Subject: rbd: define separate read and write format funcs Separate rbd_osd_req_format() into two functions, one for read requests and the other for write requests. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 49 ++++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 449847badcd8..e15c70e3f860 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1489,28 +1489,31 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, rbd_obj_request_complete(obj_request); } -static void rbd_osd_req_format(struct rbd_obj_request *obj_request, - bool write_request) +static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request = obj_request->img_request; struct ceph_osd_request *osd_req = obj_request->osd_req; - struct ceph_snap_context *snapc = NULL; - u64 snap_id = CEPH_NOSNAP; - struct timespec *mtime = NULL; - struct timespec now; + u64 snap_id; rbd_assert(osd_req != NULL); - if (write_request) { - now = CURRENT_TIME; - mtime = &now; - if (img_request) - snapc = img_request->snapc; - } else if (img_request) { - snap_id = img_request->snap_id; - } + snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; + ceph_osdc_build_request(osd_req, obj_request->offset, + NULL, snap_id, NULL); +} + +static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request = obj_request->img_request; + struct ceph_osd_request *osd_req = obj_request->osd_req; + struct ceph_snap_context *snapc; + struct timespec mtime = CURRENT_TIME; + + rbd_assert(osd_req != NULL); + + snapc = img_request ? 
img_request->snapc : NULL; ceph_osdc_build_request(osd_req, obj_request->offset, - snapc, snap_id, mtime); + snapc, CEPH_NOSNAP, &mtime); } static struct ceph_osd_request *rbd_osd_req_create( @@ -1845,7 +1848,11 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, 0, 0); osd_req_op_extent_osd_data_bio(osd_req, 0, obj_request->bio_list, obj_request->length); - rbd_osd_req_format(obj_request, write_request); + + if (write_request) + rbd_osd_req_format_write(obj_request); + else + rbd_osd_req_format_read(obj_request); obj_request->img_offset = img_offset; rbd_img_obj_request_add(img_request, obj_request); @@ -1969,7 +1976,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, false, false); - rbd_osd_req_format(stat_request, false); + rbd_osd_req_format_read(stat_request); osdc = &rbd_dev->rbd_client->client->osdc; ret = rbd_obj_request_submit(osdc, stat_request); @@ -2091,7 +2098,7 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver, 0); - rbd_osd_req_format(obj_request, false); + rbd_osd_req_format_read(obj_request); ret = rbd_obj_request_submit(osdc, obj_request); out: @@ -2161,7 +2168,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, rbd_dev->watch_event->cookie, rbd_dev->header.obj_version, start); - rbd_osd_req_format(obj_request, true); + rbd_osd_req_format_write(obj_request); ret = rbd_obj_request_submit(osdc, obj_request); if (ret) @@ -2262,7 +2269,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, obj_request->pages, inbound_size, 0, false, false); - rbd_osd_req_format(obj_request, false); + 
rbd_osd_req_format_read(obj_request); ret = rbd_obj_request_submit(osdc, obj_request); if (ret) @@ -2473,7 +2480,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, obj_request->length, obj_request->offset & ~PAGE_MASK, false, false); - rbd_osd_req_format(obj_request, false); + rbd_osd_req_format_read(obj_request); ret = rbd_obj_request_submit(osdc, obj_request); if (ret) -- cgit v1.2.3 From b454e36d2638c005c6574c2289529f5738f156cb Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 19 Apr 2013 15:34:50 -0500 Subject: rbd: encapsulate submission of image object requests Object requests that are part of an image request are subject to some additional handling. Define rbd_img_obj_request_submit() to encapsulate that, and use it when initially submitting an image object request, and when re-submitting it during callback of an object existence check. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 65 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 22 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e15c70e3f860..e208cec808dc 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -423,6 +423,7 @@ void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 
#endif /* !RBD_DEBUG */ static void rbd_img_parent_read(struct rbd_obj_request *obj_request); +static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); @@ -1874,8 +1875,6 @@ out_unwind: static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) { - struct rbd_device *rbd_dev; - struct ceph_osd_client *osdc; struct rbd_obj_request *orig_request; int result; @@ -1901,8 +1900,6 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) rbd_assert(orig_request); rbd_assert(orig_request->img_request); - rbd_dev = orig_request->img_request->rbd_dev; - osdc = &rbd_dev->rbd_client->client->osdc; /* * Our only purpose here is to determine whether the object @@ -1923,7 +1920,7 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) * Resubmit the original request now that we have recorded * whether the target object exists. */ - orig_request->result = rbd_obj_request_submit(osdc, orig_request); + orig_request->result = rbd_img_obj_request_submit(orig_request); out_err: if (orig_request->result) rbd_obj_request_complete(orig_request); @@ -1987,32 +1984,56 @@ out: return ret; } +static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + + rbd_assert(obj_request_img_data_test(obj_request)); + + img_request = obj_request->img_request; + rbd_assert(img_request); + + /* (At the moment we don't care whether it exists or not...) */ + (void) obj_request_exists_test; + + /* + * Only layered writes need special handling. If it's not a + * layered write, or it is a layered write but we know the + * target object exists, it's no different from any other + * object request. 
+ */ + if (!img_request_write_test(img_request) || + !img_request_layered_test(img_request) || + obj_request_known_test(obj_request)) { + + struct rbd_device *rbd_dev; + struct ceph_osd_client *osdc; + + rbd_dev = obj_request->img_request->rbd_dev; + osdc = &rbd_dev->rbd_client->client->osdc; + + return rbd_obj_request_submit(osdc, obj_request); + } + + /* + * It's a layered write and we don't know whether the target + * exists. Issue existence check; once that completes the + * original request will be submitted again. + */ + + return rbd_img_obj_exists_submit(obj_request); +} + static int rbd_img_request_submit(struct rbd_img_request *img_request) { - struct rbd_device *rbd_dev = img_request->rbd_dev; - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; struct rbd_obj_request *next_obj_request; - bool write_request = img_request_write_test(img_request); - bool layered = img_request_layered_test(img_request); dout("%s: img %p\n", __func__, img_request); for_each_obj_request_safe(img_request, obj_request, next_obj_request) { - bool known; - bool object_exists; int ret; - /* - * We need to know whether the target object exists - * for a layered write. Issue an existence check - * first if we need to. - */ - known = obj_request_known_test(obj_request); - object_exists = known && obj_request_exists_test(obj_request); - if (!write_request || !layered || object_exists) - ret = rbd_obj_request_submit(osdc, obj_request); - else - ret = rbd_img_obj_exists_submit(obj_request); + ret = rbd_img_obj_request_submit(obj_request); if (ret) return ret; } -- cgit v1.2.3 From b9434c5b43d1a90e762fe64169862fb198746935 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 19 Apr 2013 15:34:50 -0500 Subject: rbd: define zero_pages() Define a new function zero_pages() that zeroes a range of memory defined by a page array, along the lines of zero_bio_chain(). 
It saves and restores the irq flags like bvec_kmap_irq() does, though I'm not sure at this point that it's necessary. Update rbd_img_obj_request_read_callback() to use the new function if the object request contains page rather than bio data. For the moment, only bio data is used for osd READ ops. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 55 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e208cec808dc..06bbd55c0ea1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -970,6 +970,37 @@ static void zero_bio_chain(struct bio *chain, int start_ofs) } } +/* + * similar to zero_bio_chain(), zeros data defined by a page array, + * starting at the given byte offset from the start of the array and + * continuing up to the given end offset. The pages array is + * assumed to be big enough to hold all bytes up to the end. + */ +static void zero_pages(struct page **pages, u64 offset, u64 end) +{ + struct page **page = &pages[offset >> PAGE_SHIFT]; + + rbd_assert(end > offset); + rbd_assert(end - offset <= (u64)SIZE_MAX); + while (offset < end) { + size_t page_offset; + size_t length; + unsigned long flags; + void *kaddr; + + page_offset = (size_t)(offset & ~PAGE_MASK); + length = min(PAGE_SIZE - page_offset, (size_t)(end - offset)); + local_irq_save(flags); + kaddr = kmap_atomic(*page); + memset(kaddr + page_offset, 0, length); + kunmap_atomic(kaddr); + local_irq_restore(flags); + + offset += length; + page++; + } +} + /* * Clone a portion of a bio, starting at the given byte offset * and continuing for the number of bytes indicated. 
@@ -1352,9 +1383,12 @@ static bool img_request_layered_test(struct rbd_img_request *img_request) static void rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) { + u64 xferred = obj_request->xferred; + u64 length = obj_request->length; + dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, obj_request, obj_request->img_request, obj_request->result, - obj_request->xferred, obj_request->length); + xferred, length); /* * ENOENT means a hole in the image. We zero-fill the * entire length of the request. A short read also implies @@ -1362,15 +1396,20 @@ rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) * update the xferred count to indicate the whole request * was satisfied. */ - BUG_ON(obj_request->type != OBJ_REQUEST_BIO); + rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); if (obj_request->result == -ENOENT) { - zero_bio_chain(obj_request->bio_list, 0); + if (obj_request->type == OBJ_REQUEST_BIO) + zero_bio_chain(obj_request->bio_list, 0); + else + zero_pages(obj_request->pages, 0, length); obj_request->result = 0; - obj_request->xferred = obj_request->length; - } else if (obj_request->xferred < obj_request->length && - !obj_request->result) { - zero_bio_chain(obj_request->bio_list, obj_request->xferred); - obj_request->xferred = obj_request->length; + obj_request->xferred = length; + } else if (xferred < length && !obj_request->result) { + if (obj_request->type == OBJ_REQUEST_BIO) + zero_bio_chain(obj_request->bio_list, xferred); + else + zero_pages(obj_request->pages, xferred, length); + obj_request->xferred = length; } obj_request_done_set(obj_request); } -- cgit v1.2.3 From f1a4739f333b519fe041e1ad81d9b31c94b9d6a3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 19 Apr 2013 15:34:50 -0500 Subject: rbd: support page array image requests This patch adds the ability to build an image request whose data will be written from or read into memory described by a page array. (Previously only bio lists were supported.) 
Originally this was going to define a new function for this purpose but it was largely identical to the rbd_img_request_fill_bio(). So instead, rbd_img_request_fill_bio() has been generalized to handle both types of image request. For the moment we still only fill image requests with bio data. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 86 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 06bbd55c0ea1..8a7216d784d7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1780,6 +1780,13 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) img_request->result = result; } + /* Image object requests don't own their page array */ + + if (obj_request->type == OBJ_REQUEST_PAGES) { + obj_request->pages = NULL; + obj_request->page_count = 0; + } + if (img_request_child_test(img_request)) { rbd_assert(img_request->obj_request != NULL); more = obj_request->which < img_request->obj_request_count - 1; @@ -1830,30 +1837,48 @@ out: rbd_img_request_complete(img_request); } -static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, - struct bio *bio_list) +/* + * Split up an image request into one or more object requests, each + * to a different object. The "type" parameter indicates whether + * "data_desc" is the pointer to the head of a list of bio + * structures, or the base of a page array. In either case this + * function assumes data_desc describes memory sufficient to hold + * all data described by the image request. 
+ */ +static int rbd_img_request_fill(struct rbd_img_request *img_request, + enum obj_request_type type, + void *data_desc) { struct rbd_device *rbd_dev = img_request->rbd_dev; struct rbd_obj_request *obj_request = NULL; struct rbd_obj_request *next_obj_request; bool write_request = img_request_write_test(img_request); - unsigned int bio_offset; + struct bio *bio_list; + unsigned int bio_offset = 0; + struct page **pages; u64 img_offset; u64 resid; u16 opcode; - dout("%s: img %p bio %p\n", __func__, img_request, bio_list); + dout("%s: img %p type %d data_desc %p\n", __func__, img_request, + (int)type, data_desc); opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; - bio_offset = 0; img_offset = img_request->offset; - rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); resid = img_request->length; rbd_assert(resid > 0); + + if (type == OBJ_REQUEST_BIO) { + bio_list = data_desc; + rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); + } else { + rbd_assert(type == OBJ_REQUEST_PAGES); + pages = data_desc; + } + while (resid) { struct ceph_osd_request *osd_req; const char *object_name; - unsigned int clone_size; u64 offset; u64 length; @@ -1863,19 +1888,33 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, offset = rbd_segment_offset(rbd_dev, img_offset); length = rbd_segment_length(rbd_dev, img_offset, resid); obj_request = rbd_obj_request_create(object_name, - offset, length, - OBJ_REQUEST_BIO); + offset, length, type); kfree(object_name); /* object request has its own copy */ if (!obj_request) goto out_unwind; - rbd_assert(length <= (u64) UINT_MAX); - clone_size = (unsigned int) length; - obj_request->bio_list = bio_chain_clone_range(&bio_list, - &bio_offset, clone_size, - GFP_ATOMIC); - if (!obj_request->bio_list) - goto out_partial; + if (type == OBJ_REQUEST_BIO) { + unsigned int clone_size; + + rbd_assert(length <= (u64)UINT_MAX); + clone_size = (unsigned int)length; + obj_request->bio_list = + 
bio_chain_clone_range(&bio_list, + &bio_offset, + clone_size, + GFP_ATOMIC); + if (!obj_request->bio_list) + goto out_partial; + } else { + unsigned int page_count; + + obj_request->pages = pages; + page_count = (u32)calc_pages_for(offset, length); + obj_request->page_count = page_count; + if ((offset + length) & ~PAGE_MASK) + page_count--; /* more on last page */ + pages += page_count; + } osd_req = rbd_osd_req_create(rbd_dev, write_request, obj_request); @@ -1886,8 +1925,13 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, osd_req_op_extent_init(osd_req, 0, opcode, offset, length, 0, 0); - osd_req_op_extent_osd_data_bio(osd_req, 0, - obj_request->bio_list, obj_request->length); + if (type == OBJ_REQUEST_BIO) + osd_req_op_extent_osd_data_bio(osd_req, 0, + obj_request->bio_list, length); + else + osd_req_op_extent_osd_data_pages(osd_req, 0, + obj_request->pages, length, + offset & ~PAGE_MASK, false, false); if (write_request) rbd_osd_req_format_write(obj_request); @@ -2120,7 +2164,8 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request) rbd_obj_request_get(obj_request); img_request->obj_request = obj_request; - result = rbd_img_request_fill_bio(img_request, obj_request->bio_list); + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, + obj_request->bio_list); if (result) goto out_err; @@ -2425,7 +2470,8 @@ static void rbd_request_fn(struct request_queue *q) img_request->rq = rq; - result = rbd_img_request_fill_bio(img_request, rq->bio); + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, + rq->bio); if (!result) result = rbd_img_request_submit(img_request); if (result) -- cgit v1.2.3 From d98df63ea7e87d5df4dce0cece0210e2a777ac00 Mon Sep 17 00:00:00 2001 From: Laurent Barbe Date: Wed, 10 Apr 2013 17:47:46 -0500 Subject: rbd: revalidate_disk upon rbd resize If rbd disk is open and rbd resize is done, new size is not visible by filesystem. 
As is done in the virtio-blk and dm drivers, revalidate_disk() allows the bd_inode size to be updated. Signed-off-by: Laurent Barbe Reviewed-by: Alex Elder --- drivers/block/rbd.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8a7216d784d7..b2819deced6b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2781,6 +2781,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) else ret = rbd_dev_v2_refresh(rbd_dev, hver); mutex_unlock(&ctl_mutex); + revalidate_disk(rbd_dev->disk); return ret; } -- cgit v1.2.3 From 3d7efd18d9df628e30ff36e9e488a8f0e782b678 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 19 Apr 2013 15:34:50 -0500 Subject: rbd: implement full object parent reads As a step toward implementing layered writes, implement reading the data for a target object from the parent image for a write request whose target object is known to not exist. Add a copyup_pages field to an image request to track the page array used (only) for such a request. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 143 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b2819deced6b..639dd91e7dab 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -250,6 +250,7 @@ struct rbd_img_request { struct request *rq; /* block request */ struct rbd_obj_request *obj_request; /* obj req initiator */ }; + struct page **copyup_pages; spinlock_t completion_lock;/* protects next_completion */ u32 next_completion; rbd_img_callback_t callback; @@ -350,6 +351,8 @@ static DEFINE_SPINLOCK(rbd_dev_list_lock); static LIST_HEAD(rbd_client_list); /* clients */ static DEFINE_SPINLOCK(rbd_client_list_lock); +static int rbd_img_request_submit(struct rbd_img_request *img_request); + static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); @@ -1956,6 +1959,133 @@ out_unwind: return -ENOMEM; } +static void +rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) +{ + struct rbd_obj_request *orig_request; + struct page **pages; + u32 page_count; + int result; + u64 obj_size; + u64 xferred; + + rbd_assert(img_request_child_test(img_request)); + + /* First get what we need from the image request */ + + pages = img_request->copyup_pages; + rbd_assert(pages != NULL); + img_request->copyup_pages = NULL; + + orig_request = img_request->obj_request; + rbd_assert(orig_request != NULL); + + result = img_request->result; + obj_size = img_request->length; + xferred = img_request->xferred; + + rbd_img_request_put(img_request); + + obj_request_existence_set(orig_request, true); + + page_count = (u32)calc_pages_for(0, obj_size); + ceph_release_page_vector(pages, page_count); + + /* Resubmit the original request (for now). 
*/ + + orig_request->result = rbd_img_obj_request_submit(orig_request); + if (orig_request->result) { + obj_request_done_set(orig_request); + rbd_obj_request_complete(orig_request); + } +} + +/* + * Read from the parent image the range of data that covers the + * entire target of the given object request. This is used for + * satisfying a layered image write request when the target of an + * object request from the image request does not exist. + * + * A page array big enough to hold the returned data is allocated + * and supplied to rbd_img_request_fill() as the "data descriptor." + * When the read completes, this page array will be transferred to + * the original object request for the copyup operation. + * + * If an error occurs, record it as the result of the original + * object request and mark it done so it gets completed. + */ +static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request = NULL; + struct rbd_img_request *parent_request = NULL; + struct rbd_device *rbd_dev; + u64 img_offset; + u64 length; + struct page **pages = NULL; + u32 page_count; + int result; + + rbd_assert(obj_request_img_data_test(obj_request)); + rbd_assert(obj_request->type == OBJ_REQUEST_BIO); + + img_request = obj_request->img_request; + rbd_assert(img_request != NULL); + rbd_dev = img_request->rbd_dev; + rbd_assert(rbd_dev->parent != NULL); + + /* + * Determine the byte range covered by the object in the + * child image to which the original request was to be sent. + */ + img_offset = obj_request->img_offset - obj_request->offset; + length = (u64)1 << rbd_dev->header.obj_order; + + /* + * Allocate a page array big enough to receive the data read + * from the parent. 
+ */ + page_count = (u32)calc_pages_for(0, length); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) { + result = PTR_ERR(pages); + pages = NULL; + goto out_err; + } + + result = -ENOMEM; + parent_request = rbd_img_request_create(rbd_dev->parent, + img_offset, length, + false, true); + if (!parent_request) + goto out_err; + rbd_obj_request_get(obj_request); + parent_request->obj_request = obj_request; + + result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); + if (result) + goto out_err; + parent_request->copyup_pages = pages; + + parent_request->callback = rbd_img_obj_parent_read_full_callback; + result = rbd_img_request_submit(parent_request); + if (!result) + return 0; + + parent_request->copyup_pages = NULL; + parent_request->obj_request = NULL; + rbd_obj_request_put(obj_request); +out_err: + if (pages) + ceph_release_page_vector(pages, page_count); + if (parent_request) + rbd_img_request_put(parent_request); + obj_request->result = result; + obj_request->xferred = 0; + obj_request_done_set(obj_request); + + return result; +} + static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) { struct rbd_obj_request *orig_request; @@ -1996,7 +2126,7 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) obj_request_existence_set(orig_request, false); } else if (result) { orig_request->result = result; - goto out_err; + goto out; } /* @@ -2004,7 +2134,7 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) * whether the target object exists. 
*/ orig_request->result = rbd_img_obj_request_submit(orig_request); -out_err: +out: if (orig_request->result) rbd_obj_request_complete(orig_request); rbd_obj_request_put(orig_request); @@ -2070,15 +2200,13 @@ out: static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; + bool known; rbd_assert(obj_request_img_data_test(obj_request)); img_request = obj_request->img_request; rbd_assert(img_request); - /* (At the moment we don't care whether it exists or not...) */ - (void) obj_request_exists_test; - /* * Only layered writes need special handling. If it's not a * layered write, or it is a layered write but we know the @@ -2087,7 +2215,8 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) */ if (!img_request_write_test(img_request) || !img_request_layered_test(img_request) || - obj_request_known_test(obj_request)) { + ((known = obj_request_known_test(obj_request)) && + obj_request_exists_test(obj_request))) { struct rbd_device *rbd_dev; struct ceph_osd_client *osdc; @@ -2099,10 +2228,15 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) } /* - * It's a layered write and we don't know whether the target - * exists. Issue existence check; once that completes the - * original request will be submitted again. + * It's a layered write. The target object might exist but + * we may not know that yet. If we know it doesn't exist, + * start by reading the data for the full target object from + * the parent so we can use it for a copyup to the target. */ + if (known) + return rbd_img_obj_parent_read_full(obj_request); + + /* We don't know whether the target exists. Go find out. 
*/ return rbd_img_obj_exists_submit(obj_request); } -- cgit v1.2.3 From 0eefd470f034cc18349fa1a9e4fda000e963c4e3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 19 Apr 2013 15:34:50 -0500 Subject: rbd: issue a copyup for layered writes This implements the main copyup functionality for layered writes. Here we add a copyup_pages field to the object request, which is used only for copyup requests to keep track of the page array containing data read from the parent image. A copyup request is currently the only request rbd has that requires two osd operations. Because of this we handle copyup specially. All image object requests get an osd request allocated when they are created. For a write request, if a copyup is required, the osd request originally allocated is released, and a new one (with room for two osd ops) is allocated to replace it. A new function rbd_osd_req_create_copyup() allocates an osd request suitable for a copyup request. The first op is then filled with a copyup object class method call, supplying the array of pages containing data read from the parent. The second op is filled in with the original write request. The original request otherwise remains intact, and it describes the original write request (found in the second osd op). The presence of the copyup op is sort of implicit; a non-null copyup_pages field could be used to distinguish between a "normal" write request and a request containing both a copyup call and a write. 
This resolves: http://tracker.ceph.com/issues/3419 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 137 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 639dd91e7dab..c34719c917b1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -218,6 +218,7 @@ struct rbd_obj_request { u32 page_count; }; }; + struct page **copyup_pages; struct ceph_osd_request *osd_req; @@ -1498,7 +1499,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, obj_request->result = osd_req->r_result; obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); - WARN_ON(osd_req->r_num_ops != 1); /* For now */ + BUG_ON(osd_req->r_num_ops > 2); /* * We support a 64-bit length, but ultimately it has to be @@ -1601,6 +1602,48 @@ static struct ceph_osd_request *rbd_osd_req_create( return osd_req; } +/* + * Create a copyup osd request based on the information in the + * object request supplied. A copyup request has two osd ops, + * a copyup method call, and a "normal" write request. 
+ */ +static struct ceph_osd_request * +rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + struct ceph_snap_context *snapc; + struct rbd_device *rbd_dev; + struct ceph_osd_client *osdc; + struct ceph_osd_request *osd_req; + + rbd_assert(obj_request_img_data_test(obj_request)); + img_request = obj_request->img_request; + rbd_assert(img_request); + rbd_assert(img_request_write_test(img_request)); + + /* Allocate and initialize the request, for the two ops */ + + snapc = img_request->snapc; + rbd_dev = img_request->rbd_dev; + osdc = &rbd_dev->rbd_client->client->osdc; + osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC); + if (!osd_req) + return NULL; /* ENOMEM */ + + osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + osd_req->r_callback = rbd_osd_req_callback; + osd_req->r_priv = obj_request; + + osd_req->r_oid_len = strlen(obj_request->object_name); + rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); + memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); + + osd_req->r_file_layout = rbd_dev->layout; /* struct */ + + return osd_req; +} + + static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) { ceph_osdc_put_request(osd_req); @@ -1959,12 +2002,50 @@ out_unwind: return -ENOMEM; } +static void +rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + struct rbd_device *rbd_dev; + u64 length; + u32 page_count; + + rbd_assert(obj_request->type == OBJ_REQUEST_BIO); + rbd_assert(obj_request_img_data_test(obj_request)); + img_request = obj_request->img_request; + rbd_assert(img_request); + + rbd_dev = img_request->rbd_dev; + rbd_assert(rbd_dev); + length = (u64)1 << rbd_dev->header.obj_order; + page_count = (u32)calc_pages_for(0, length); + + rbd_assert(obj_request->copyup_pages); + ceph_release_page_vector(obj_request->copyup_pages, page_count); + obj_request->copyup_pages = NULL; + + /* + * We want 
the transfer count to reflect the size of the + * original write request. There is no such thing as a + * successful short write, so if the request was successful + * we can just set it to the originally-requested length. + */ + if (!obj_request->result) + obj_request->xferred = obj_request->length; + + /* Finish up with the normal image object callback */ + + rbd_img_obj_callback(obj_request); +} + static void rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) { struct rbd_obj_request *orig_request; + struct ceph_osd_request *osd_req; + struct ceph_osd_client *osdc; + struct rbd_device *rbd_dev; struct page **pages; - u32 page_count; int result; u64 obj_size; u64 xferred; @@ -1979,25 +2060,60 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) orig_request = img_request->obj_request; rbd_assert(orig_request != NULL); - + rbd_assert(orig_request->type == OBJ_REQUEST_BIO); result = img_request->result; obj_size = img_request->length; xferred = img_request->xferred; + rbd_dev = img_request->rbd_dev; + rbd_assert(rbd_dev); + rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order); + rbd_img_request_put(img_request); - obj_request_existence_set(orig_request, true); + if (result) + goto out_err; + + /* Allocate the new copyup osd request for the original request */ - page_count = (u32)calc_pages_for(0, obj_size); - ceph_release_page_vector(pages, page_count); + result = -ENOMEM; + rbd_assert(!orig_request->osd_req); + osd_req = rbd_osd_req_create_copyup(orig_request); + if (!osd_req) + goto out_err; + orig_request->osd_req = osd_req; + orig_request->copyup_pages = pages; - /* Resubmit the original request (for now). 
*/ + /* Initialize the copyup op */ - orig_request->result = rbd_img_obj_request_submit(orig_request); - if (orig_request->result) { - obj_request_done_set(orig_request); - rbd_obj_request_complete(orig_request); - } + osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); + osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0, + false, false); + + /* Then the original write request op */ + + osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, + orig_request->offset, + orig_request->length, 0, 0); + osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list, + orig_request->length); + + rbd_osd_req_format_write(orig_request); + + /* All set, send it off. */ + + orig_request->callback = rbd_img_obj_copyup_callback; + osdc = &rbd_dev->rbd_client->client->osdc; + result = rbd_obj_request_submit(osdc, orig_request); + if (!result) + return; +out_err: + /* Record the error code and complete the request */ + + orig_request->result = result; + orig_request->xferred = 0; + obj_request_done_set(orig_request); + rbd_obj_request_complete(orig_request); } /* @@ -2033,6 +2149,15 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) rbd_dev = img_request->rbd_dev; rbd_assert(rbd_dev->parent != NULL); + /* + * First things first. The original osd request is of no + * use to use any more, we'll need a new one that can hold + * the two ops in a copyup request. We'll get that later, + * but for now we can release the old one. + */ + rbd_osd_req_destroy(obj_request->osd_req); + obj_request->osd_req = NULL; + /* * Determine the byte range covered by the object in the * child image to which the original request was to be sent. -- cgit v1.2.3 From a9e8ba2cb3eb64cf6cfa509d096ef79bc1c827ae Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 21 Apr 2013 00:32:07 -0500 Subject: rbd: enforce parent overlap A clone image has a defined overlap point with its parent image. 
That is the byte offset beyond which the parent image has no defined data to back the clone, and anything thereafter can be viewed as being zero-filled by the clone image. This is needed because a clone image can be resized. If it gets resized larger than the snapshot it is based on, the overlap defines the original size. If the clone gets resized downward below the original size the new clone size defines the overlap. If the clone is subsequently resized to be larger, the overlap won't be increased because the previous resize invalidated any parent data beyond that point. This resolves: http://tracker.ceph.com/issues/4724 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 64 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c34719c917b1..ee53d8e52801 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1437,20 +1437,20 @@ static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request = NULL; + struct rbd_device *rbd_dev = NULL; bool layered = false; if (obj_request_img_data_test(obj_request)) { img_request = obj_request->img_request; layered = img_request && img_request_layered_test(img_request); - } else { - img_request = NULL; - layered = false; + rbd_dev = img_request->rbd_dev; } dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, obj_request, img_request, obj_request->result, obj_request->xferred, obj_request->length); - if (layered && obj_request->result == -ENOENT) + if (layered && obj_request->result == -ENOENT && + obj_request->img_offset < rbd_dev->parent_overlap) rbd_img_parent_read(obj_request); else if (img_request) rbd_img_obj_request_read_callback(obj_request); @@ -2165,6 +2165,16 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request 
*obj_request) img_offset = obj_request->img_offset - obj_request->offset; length = (u64)1 << rbd_dev->header.obj_order; + /* + * There is no defined parent data beyond the parent + * overlap, so limit what we read at that boundary if + * necessary. + */ + if (img_offset + length > rbd_dev->parent_overlap) { + rbd_assert(img_offset < rbd_dev->parent_overlap); + length = rbd_dev->parent_overlap - img_offset; + } + /* * Allocate a page array big enough to receive the data read * from the parent. @@ -2325,21 +2335,28 @@ out: static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; + struct rbd_device *rbd_dev; bool known; rbd_assert(obj_request_img_data_test(obj_request)); img_request = obj_request->img_request; rbd_assert(img_request); + rbd_dev = img_request->rbd_dev; /* - * Only layered writes need special handling. If it's not a - * layered write, or it is a layered write but we know the - * target object exists, it's no different from any other - * object request. + * Only writes to layered images need special handling. + * Reads and non-layered writes are simple object requests. + * Layered writes that start beyond the end of the overlap + * with the parent have no parent data, so they too are + * simple object requests. Finally, if the target object is + * known to already exist, its parent data has already been + * copied, so a write to the object can also be handled as a + * simple object request. 
*/ if (!img_request_write_test(img_request) || !img_request_layered_test(img_request) || + rbd_dev->parent_overlap <= obj_request->img_offset || ((known = obj_request_known_test(obj_request)) && obj_request_exists_test(obj_request))) { @@ -2386,14 +2403,41 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) { struct rbd_obj_request *obj_request; + struct rbd_device *rbd_dev; + u64 obj_end; rbd_assert(img_request_child_test(img_request)); obj_request = img_request->obj_request; - rbd_assert(obj_request != NULL); + rbd_assert(obj_request); + rbd_assert(obj_request->img_request); + obj_request->result = img_request->result; - obj_request->xferred = img_request->xferred; + if (obj_request->result) + goto out; + /* + * We need to zero anything beyond the parent overlap + * boundary. Since rbd_img_obj_request_read_callback() + * will zero anything beyond the end of a short read, an + * easy way to do this is to pretend the data from the + * parent came up short--ending at the overlap boundary. + */ + rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); + obj_end = obj_request->img_offset + obj_request->length; + rbd_dev = obj_request->img_request->rbd_dev; + if (obj_end > rbd_dev->parent_overlap) { + u64 xferred = 0; + + if (obj_request->img_offset < rbd_dev->parent_overlap) + xferred = rbd_dev->parent_overlap - + obj_request->img_offset; + + obj_request->xferred = min(img_request->xferred, xferred); + } else { + obj_request->xferred = img_request->xferred; + } +out: rbd_img_obj_request_read_callback(obj_request); rbd_obj_request_complete(obj_request); } -- cgit v1.2.3 From 80ef15bf71a8ed40e47238e1f4f8b3f2a41f58fe Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 21 Apr 2013 12:14:45 -0500 Subject: rbd: give rbd_obj_read_sync() buffer void type Make the buf parameter into which the data is to be read have type void pointer. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ee53d8e52801..6436b3ff5470 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2854,7 +2854,7 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) static int rbd_obj_read_sync(struct rbd_device *rbd_dev, const char *object_name, u64 offset, u64 length, - char *buf, u64 *version) + void *buf, u64 *version) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; @@ -2957,8 +2957,7 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) return ERR_PTR(-ENOMEM); ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, - 0, size, - (char *) ondisk, version); + 0, size, ondisk, version); if (ret < 0) goto out_err; if (WARN_ON((size_t) ret < size)) { -- cgit v1.2.3 From 4157976b27287e239d5ae879d2916540fe0b576e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 21 Apr 2013 12:14:45 -0500 Subject: rbd: void data pointers for rbd_obj_method_sync() Make the inbound and outbound data parameters have void rather than character type for rbd_obj_method_sync(). This makes it more clear they don't expect typed data, and eliminates the need for some silly type casts. One more unrelated change: define the features buffer used in _rbd_dev_v2_snap_features() to be a packed data structure. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6436b3ff5470..91b4b741efda 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2623,9 +2623,9 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, const char *object_name, const char *class_name, const char *method_name, - const char *outbound, + const void *outbound, size_t outbound_size, - char *inbound, + void *inbound, size_t inbound_size, u64 *version) { @@ -3578,8 +3578,8 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_size", - (char *) &snapid, sizeof (snapid), - (char *) &size_buf, sizeof (size_buf), NULL); + &snapid, sizeof (snapid), + &size_buf, sizeof (size_buf), NULL); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -3612,8 +3612,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) return -ENOMEM; ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, - "rbd", "get_object_prefix", - NULL, 0, + "rbd", "get_object_prefix", NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) @@ -3644,15 +3643,14 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, struct { __le64 features; __le64 incompat; - } features_buf = { 0 }; + } __attribute__ ((packed)) features_buf = { 0 }; u64 incompat; int ret; ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_features", - (char *) &snapid, sizeof (snapid), - (char *) &features_buf, sizeof (features_buf), - NULL); + &snapid, sizeof (snapid), + &features_buf, sizeof (features_buf), NULL); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ 
-3706,15 +3704,15 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) snapid = cpu_to_le64(CEPH_NOSNAP); ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_parent", - (char *) &snapid, sizeof (snapid), - (char *) reply_buf, size, NULL); + &snapid, sizeof (snapid), + reply_buf, size, NULL); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out_err; ret = -ERANGE; p = reply_buf; - end = (char *) reply_buf + size; + end = reply_buf + size; ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); if (parent_spec->pool_id == CEPH_NOPOOL) goto out; /* No parent? No problem. */ @@ -3767,7 +3765,7 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) return NULL; p = image_id; - end = (char *) image_id + image_id_size; + end = image_id + image_id_size; ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; @@ -3778,11 +3776,11 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, "rbd", "dir_get_name", image_id, image_id_size, - (char *) reply_buf, size, NULL); + reply_buf, size, NULL); if (ret < 0) goto out; p = reply_buf; - end = (char *) reply_buf + size; + end = reply_buf + size; image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); if (IS_ERR(image_name)) image_name = NULL; @@ -3831,7 +3829,7 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) name = rbd_dev_image_name(rbd_dev); if (name) - rbd_dev->spec->image_name = (char *) name; + rbd_dev->spec->image_name = (char *)name; else rbd_warn(rbd_dev, "unable to get image name"); @@ -3882,8 +3880,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) return -ENOMEM; ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, - "rbd", "get_snapcontext", - NULL, 0, + "rbd", "get_snapcontext", NULL, 0, reply_buf, size, ver); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 
if (ret < 0) @@ -3891,7 +3888,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) ret = -ERANGE; p = reply_buf; - end = (char *) reply_buf + size; + end = reply_buf + size; ceph_decode_64_safe(&p, end, seq, out); ceph_decode_32_safe(&p, end, snap_count, out); @@ -3952,14 +3949,14 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_snapshot_name", - (char *) &snap_id, sizeof (snap_id), + &snap_id, sizeof (snap_id), reply_buf, size, NULL); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; p = reply_buf; - end = (char *) reply_buf + size; + end = reply_buf + size; snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(snap_name)) { ret = PTR_ERR(snap_name); @@ -4555,8 +4552,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) } ret = rbd_obj_method_sync(rbd_dev, object_name, - "rbd", "get_id", - NULL, 0, + "rbd", "get_id", NULL, 0, response, RBD_IMAGE_ID_LEN_MAX, NULL); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) -- cgit v1.2.3 From 57385b51c3ffd0fed2dd9d5d8e4ec080c85ecbcd Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 21 Apr 2013 12:14:45 -0500 Subject: rbd: have rbd_obj_method_sync() return transfer count Callers of rbd_obj_method_sync() don't know how many bytes of data got returned by the class method call. As a result, they have been assuming enough got returned to decode whatever was expected. This isn't safe. We know how many bytes got transferred, so have rbd_obj_method_sync() return that amount (rather than just 0) if the call is successful. Change all callers to use this return value to ensure decoding of the results is done safely. 
On the other hand, most callers of rbd_obj_method_sync() only indicate success or failure, so all of *their* callers can simply test for non-zero result. This resolves: http://tracker.ceph.com/issues/4773 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 60 +++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 27 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 91b4b741efda..44dcc82770d9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2642,7 +2642,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, * method. Currently if this is present it will be a * snapshot id. */ - page_count = (u32) calc_pages_for(0, inbound_size); + page_count = (u32)calc_pages_for(0, inbound_size); pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); if (IS_ERR(pages)) return PTR_ERR(pages); @@ -2689,7 +2689,9 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, ret = obj_request->result; if (ret < 0) goto out; - ret = 0; + + rbd_assert(obj_request->xferred < (u64)INT_MAX); + ret = (int)obj_request->xferred; ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); if (version) *version = obj_request->version; @@ -3583,13 +3585,15 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; + if (ret < sizeof (size_buf)) + return -ERANGE; *order = size_buf.order; *snap_size = le64_to_cpu(size_buf.size); dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", - (unsigned long long) snap_id, (unsigned int) *order, - (unsigned long long) *snap_size); + (unsigned long long)snap_id, (unsigned int)*order, + (unsigned long long)*snap_size); return 0; } @@ -3620,8 +3624,8 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) p = reply_buf; rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, - p + 
RBD_OBJ_PREFIX_LEN_MAX, - NULL, GFP_NOIO); + p + ret, NULL, GFP_NOIO); + ret = 0; if (IS_ERR(rbd_dev->header.object_prefix)) { ret = PTR_ERR(rbd_dev->header.object_prefix); @@ -3629,7 +3633,6 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) } else { dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); } - out: kfree(reply_buf); @@ -3654,6 +3657,8 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; + if (ret < sizeof (features_buf)) + return -ERANGE; incompat = le64_to_cpu(features_buf.incompat); if (incompat & ~RBD_FEATURES_SUPPORTED) @@ -3662,9 +3667,9 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, *snap_features = le64_to_cpu(features_buf.features); dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", - (unsigned long long) snap_id, - (unsigned long long) *snap_features, - (unsigned long long) le64_to_cpu(features_buf.incompat)); + (unsigned long long)snap_id, + (unsigned long long)*snap_features, + (unsigned long long)le64_to_cpu(features_buf.incompat)); return 0; } @@ -3710,9 +3715,9 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) if (ret < 0) goto out_err; - ret = -ERANGE; p = reply_buf; - end = reply_buf + size; + end = reply_buf + ret; + ret = -ERANGE; ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err); if (parent_spec->pool_id == CEPH_NOPOOL) goto out; /* No parent? No problem. 
*/ @@ -3720,8 +3725,8 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) /* The ceph file layout needs to fit pool id in 32 bits */ ret = -EIO; - if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX)) - goto out; + if (WARN_ON(parent_spec->pool_id > (u64)U32_MAX)) + goto out_err; image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { @@ -3766,7 +3771,7 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) p = image_id; end = image_id + image_id_size; - ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); + ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; reply_buf = kmalloc(size, GFP_KERNEL); @@ -3886,9 +3891,9 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) if (ret < 0) goto out; - ret = -ERANGE; p = reply_buf; - end = reply_buf + size; + end = reply_buf + ret; + ret = -ERANGE; ceph_decode_64_safe(&p, end, seq, out); ceph_decode_32_safe(&p, end, snap_count, out); @@ -3913,6 +3918,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) ret = -ENOMEM; goto out; } + ret = 0; atomic_set(&snapc->nref, 1); snapc->seq = seq; @@ -3923,12 +3929,11 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) rbd_dev->header.snapc = snapc; dout(" snap context seq = %llu, snap_count = %u\n", - (unsigned long long) seq, (unsigned int) snap_count); - + (unsigned long long)seq, (unsigned int)snap_count); out: kfree(reply_buf); - return 0; + return ret; } static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) @@ -3963,7 +3968,7 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) goto out; } else { dout(" snap_id 0x%016llx snap_name = %s\n", - (unsigned long long) le64_to_cpu(snap_id), snap_name); + (unsigned long long)le64_to_cpu(snap_id), snap_name); } kfree(reply_buf); @@ -4560,8 +4565,10 @@ static int rbd_dev_image_id(struct rbd_device 
*rbd_dev) p = response; rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, - p + RBD_IMAGE_ID_LEN_MAX, + p + ret, NULL, GFP_NOIO); + ret = 0; + if (IS_ERR(rbd_dev->spec->image_id)) { ret = PTR_ERR(rbd_dev->spec->image_id); rbd_dev->spec->image_id = NULL; @@ -4642,28 +4649,27 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) RBD_HEADER_PREFIX, rbd_dev->spec->image_id); /* Get the size and object order for the image */ - ret = rbd_dev_v2_image_size(rbd_dev); - if (ret < 0) + if (ret) goto out_err; /* Get the object prefix (a.k.a. block_name) for the image */ ret = rbd_dev_v2_object_prefix(rbd_dev); - if (ret < 0) + if (ret) goto out_err; /* Get the and check features for the image */ ret = rbd_dev_v2_features(rbd_dev); - if (ret < 0) + if (ret) goto out_err; /* If the image supports layering, get the parent info */ if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { ret = rbd_dev_v2_parent_info(rbd_dev); - if (ret < 0) + if (ret) goto out_err; } -- cgit v1.2.3 From cc070d59bc422945f83a89e9d60f749d0f82787d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 21 Apr 2013 12:14:45 -0500 Subject: rbd: get and check striping parameters If an rbd format 2 image indicates it supports the STRIPINGV2 feature we need to find out its stripe unit and stripe count in order to know whether we can use it. We don't yet support fancy striping fully, but if the default parameters are used the behavior is indistinguishable from non-fancy striping. This is necessary because some images require the STRIPINGV2 feature even if they use the default parameters. (Which is to say the feature bit was erroneously set even if the feature was not used.)
This resolves: http://tracker.ceph.com/issues/4709 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 44dcc82770d9..c6a3f46bc8d5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -317,6 +317,9 @@ struct rbd_device { u64 parent_overlap; struct rbd_device *parent; + u64 stripe_unit; + u64 stripe_count; + /* protects updating the header */ struct rw_semaphore header_rwsem; @@ -3749,6 +3752,56 @@ out_err: return ret; } +static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) +{ + struct { + __le64 stripe_unit; + __le64 stripe_count; + } __attribute__ ((packed)) striping_info_buf = { 0 }; + size_t size = sizeof (striping_info_buf); + void *p; + u64 obj_size; + u64 stripe_unit; + u64 stripe_count; + int ret; + + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + "rbd", "get_stripe_unit_count", NULL, 0, + (char *)&striping_info_buf, size, NULL); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); + if (ret < 0) + return ret; + if (ret < size) + return -ERANGE; + + /* + * We don't actually support the "fancy striping" feature + * (STRIPINGV2) yet, but if the striping sizes are the + * defaults the behavior is the same as before. So find + * out, and only fail if the image has non-default values. 
+ */ + ret = -EINVAL; + obj_size = (u64)1 << rbd_dev->header.obj_order; + p = &striping_info_buf; + stripe_unit = ceph_decode_64(&p); + if (stripe_unit != obj_size) { + rbd_warn(rbd_dev, "unsupported stripe unit " + "(got %llu want %llu)", + stripe_unit, obj_size); + return -EINVAL; + } + stripe_count = ceph_decode_64(&p); + if (stripe_count != 1) { + rbd_warn(rbd_dev, "unsupported stripe count " + "(got %llu want 1)", stripe_count); + return -EINVAL; + } + rbd_dev->stripe_unit = stripe_unit; + rbd_dev->stripe_count = stripe_count; + + return 0; +} + static char *rbd_dev_image_name(struct rbd_device *rbd_dev) { size_t image_id_size; @@ -4673,6 +4726,14 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) goto out_err; } + /* If the image supports fancy striping, get its parameters */ + + if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { + ret = rbd_dev_v2_striping_info(rbd_dev); + if (ret < 0) + goto out_err; + } + /* crypto and compression type aren't (yet) supported for v2 images */ rbd_dev->header.crypt_type = 0; -- cgit v1.2.3 From 770eba6e295fd36e43881176ee0644b9cc2803f1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Oct 2012 23:34:40 -0500 Subject: rbd: activate support for layered images Now that we have most everything in place to support layered rbd images, enable support for them in the kernel client. Issue a warning to the log that the support is considered experimental whenever a format 2 layered image is mapped. Note that we also have to claim to support the STRIPINGV2 feature, due to a mistake in the way the rbd CLI set up those flags. This feature can work if it has the right parameters, and safeguards have been put in place to reject those images that do not have compatible parameters. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c6a3f46bc8d5..4d99d40034e1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -80,7 +80,7 @@ /* Features supported by this (client software) implementation. */ -#define RBD_FEATURES_SUPPORTED (0) +#define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) /* * An RBD device name will be "rbd#", where the "rbd" comes from @@ -4724,6 +4724,8 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) ret = rbd_dev_v2_parent_info(rbd_dev); if (ret) goto out_err; + rbd_warn(rbd_dev, "WARNING: kernel support for " + "layered rbd images is EXPERIMENTAL!"); } /* If the image supports fancy striping, get its parameters */ -- cgit v1.2.3 From 3e83b65bb9a9f3a4d7f0200139bd947c940ec3ab Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 23 Apr 2013 13:52:53 -0500 Subject: rbd: don't create sysfs entries for non-mapped snapshots When an rbd image gets mapped a device entry gets created for it under /sys/bus/rbd/devices//. Inside that directory there are sysfs files that contain information about the image: its size, feature bits, major device number, and so on. Additionally, if that image has any snapshots, a device entry gets created for each of those as a "child" of the mapped device. Each of these is a subdirectory of the mapped device, and each directory contains a few files with information about the snapshot (its snapshot id, size, and feature mask). There is no clear benefit to having those device entries for the snapshots. The information provided via sysfs is of little real value--and all of it is available via rbd CLI commands. If we still wanted to see the kernel's view of this information it could be done much more simply by including it in a single sysfs file for the mapped image. But there *is* a clear cost to supporting them.
Every time a snapshot context changes, these entries need to be updated (deleted snapshots removed, new snapshots created). The rbd driver is notified of changes to the snapshot context via callbacks from an osd, and care must be taken to coordinate removal of snapshot data structures with the possibility of one of these notifications occurring. Things would be considerably simpler if we just didn't have to maintain device entries for the snapshots. So get rid of them. The ability to map a snapshot of an rbd image will remain; the only thing lost will be the ability to query these sysfs directories for information about snapshots of mapped images. This resolves: http://tracker.ceph.com/issues/4796 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- Documentation/ABI/testing/sysfs-bus-rbd | 20 ----- drivers/block/rbd.c | 137 +------------------------------- 2 files changed, 4 insertions(+), 153 deletions(-) (limited to 'drivers') diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd index cd9213ccf3dc..0a306476424e 100644 --- a/Documentation/ABI/testing/sysfs-bus-rbd +++ b/Documentation/ABI/testing/sysfs-bus-rbd @@ -66,27 +66,7 @@ current_snap The current snapshot for which the device is mapped. -snap_* - - A directory per each snapshot - parent Information identifying the pool, image, and snapshot id for the parent image in a layered rbd image (format 2 only). - -Entries under /sys/bus/rbd/devices//snap_ -------------------------------------------------------------- - -snap_id - - The rados internal snapshot id assigned for this snapshot - -snap_size - - The size of the image when this snapshot was taken. - -snap_features - - A hexadecimal encoding of the feature bits for this snapshot.
- diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4d99d40034e1..515fbf967ef3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -272,7 +272,6 @@ struct rbd_img_request { list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) struct rbd_snap { - struct device dev; const char *name; u64 size; struct list_head node; @@ -358,7 +357,6 @@ static DEFINE_SPINLOCK(rbd_client_list_lock); static int rbd_img_request_submit(struct rbd_img_request *img_request); static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); -static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); static void rbd_dev_release(struct device *dev); static void rbd_remove_snap_dev(struct rbd_snap *snap); @@ -3069,8 +3067,6 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) kfree(h.object_prefix); ret = rbd_dev_snaps_update(rbd_dev); - if (!ret) - ret = rbd_dev_snaps_register(rbd_dev); up_write(&rbd_dev->header_rwsem); @@ -3344,71 +3340,6 @@ static struct device_type rbd_device_type = { .release = rbd_sysfs_dev_release, }; - -/* - sysfs - snapshots -*/ - -static ssize_t rbd_snap_size_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - - return sprintf(buf, "%llu\n", (unsigned long long)snap->size); -} - -static ssize_t rbd_snap_id_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - - return sprintf(buf, "%llu\n", (unsigned long long)snap->id); -} - -static ssize_t rbd_snap_features_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - - return sprintf(buf, "0x%016llx\n", - (unsigned long long) snap->features); -} - -static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); -static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); -static 
DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); - -static struct attribute *rbd_snap_attrs[] = { - &dev_attr_snap_size.attr, - &dev_attr_snap_id.attr, - &dev_attr_snap_features.attr, - NULL, -}; - -static struct attribute_group rbd_snap_attr_group = { - .attrs = rbd_snap_attrs, -}; - -static void rbd_snap_dev_release(struct device *dev) -{ - struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); - kfree(snap->name); - kfree(snap); -} - -static const struct attribute_group *rbd_snap_attr_groups[] = { - &rbd_snap_attr_group, - NULL -}; - -static struct device_type rbd_snap_device_type = { - .groups = rbd_snap_attr_groups, - .release = rbd_snap_dev_release, -}; - static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) { kref_get(&spec->kref); @@ -3483,38 +3414,11 @@ static void rbd_dev_destroy(struct rbd_device *rbd_dev) kfree(rbd_dev); } -static bool rbd_snap_registered(struct rbd_snap *snap) -{ - bool ret = snap->dev.type == &rbd_snap_device_type; - bool reg = device_is_registered(&snap->dev); - - rbd_assert(!ret ^ reg); - - return ret; -} - static void rbd_remove_snap_dev(struct rbd_snap *snap) { list_del(&snap->node); - if (device_is_registered(&snap->dev)) - device_unregister(&snap->dev); -} - -static int rbd_register_snap_dev(struct rbd_snap *snap, - struct device *parent) -{ - struct device *dev = &snap->dev; - int ret; - - dev->type = &rbd_snap_device_type; - dev->parent = parent; - dev->release = rbd_snap_dev_release; - dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name); - dout("%s: registering device for snapshot %s\n", __func__, snap->name); - - ret = device_register(dev); - - return ret; + kfree(snap->name); + kfree(snap); } static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, @@ -4089,8 +3993,6 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) dout("rbd_dev_snaps_update returned %d\n", ret); if (ret) goto out; - ret = rbd_dev_snaps_register(rbd_dev); - 
dout("rbd_dev_snaps_register returned %d\n", ret); out: up_write(&rbd_dev->header_rwsem); @@ -4145,11 +4047,11 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) */ if (rbd_dev->spec->snap_id == snap->id) clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); - rbd_remove_snap_dev(snap); - dout("%ssnap id %llu has been removed\n", + dout("removing %ssnap id %llu\n", rbd_dev->spec->snap_id == snap->id ? "mapped " : "", (unsigned long long) snap->id); + rbd_remove_snap_dev(snap); /* Done with this list entry; advance */ @@ -4209,31 +4111,6 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) return 0; } -/* - * Scan the list of snapshots and register the devices for any that - * have not already been registered. - */ -static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) -{ - struct rbd_snap *snap; - int ret = 0; - - dout("%s:\n", __func__); - if (WARN_ON(!device_is_registered(&rbd_dev->dev))) - return -EIO; - - list_for_each_entry(snap, &rbd_dev->snaps, node) { - if (!rbd_snap_registered(snap)) { - ret = rbd_register_snap_dev(snap, &rbd_dev->dev); - if (ret < 0) - break; - } - } - dout("%s: returning %d\n", __func__, ret); - - return ret; -} - static int rbd_bus_add_dev(struct rbd_device *rbd_dev) { struct device *dev; @@ -4840,12 +4717,6 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) rbd_dev->parent = parent; } - down_write(&rbd_dev->header_rwsem); - ret = rbd_dev_snaps_register(rbd_dev); - up_write(&rbd_dev->header_rwsem); - if (ret) - goto err_out_bus; - ret = rbd_dev_header_watch_sync(rbd_dev, 1); if (ret) goto err_out_bus; -- cgit v1.2.3 From 522a0cc0f0ecdb1857db7795b1c17591f28f9ca0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Apr 2013 15:09:41 -0500 Subject: rbd: fix leak of snapshots during initial probe When an rbd image is initially mapped, its snapshot context is collected, and then a list of snapshot entries representing the snapshots in that context is created. 
The list is created using rbd_dev_snaps_update(). (This function also supports updating an existing snapshot list based on a new snapshot context.) If an error occurs, updating the list is aborted, and the list is currently left as-is, in an inconsistent state. At that point, there may be a partially-constructed list, but the calling functions (rbd_dev_probe_finish() from rbd_dev_probe() from rbd_add()) never clean them up. So this constitutes a leak. A snapshot list that is inconsistent with the current snapshot context is of no use, and might even be actively bad. So rather than just having the caller clean it up, have rbd_dev_snaps_update() just clear out the entire snapshot list in the event an error occurs. The other place rbd_dev_snaps_update() is used is when a refresh is triggered, either because of a watch callback or via a write to the /sys/bus/rbd/devices//refresh interface. An error while updating the snapshots has no substantive effect in either of those cases, but one of them issues a warning. Move that warning to the common rbd_dev_refresh() function so it gets issued regardless of how it got initiated. 
This is part of: http://tracker.ceph.com/issues/4803 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 50 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 515fbf967ef3..28b652c38102 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2521,7 +2521,6 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { struct rbd_device *rbd_dev = (struct rbd_device *)data; u64 hver; - int rc; if (!rbd_dev) return; @@ -2529,10 +2528,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, rbd_dev->header_name, (unsigned long long) notify_id, (unsigned int) opcode); - rc = rbd_dev_refresh(rbd_dev, &hver); - if (rc) - rbd_warn(rbd_dev, "got notification but failed to " - " update snaps: %d\n", rc); + (void)rbd_dev_refresh(rbd_dev, &hver); rbd_obj_notify_ack(rbd_dev, hver, notify_id); } @@ -3085,6 +3081,9 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) ret = rbd_dev_v2_refresh(rbd_dev, hver); mutex_unlock(&ctl_mutex); revalidate_disk(rbd_dev->disk); + if (ret) + rbd_warn(rbd_dev, "got notification but failed to " + " update snaps: %d\n", ret); return ret; } @@ -4010,6 +4009,11 @@ out: * Assumes the snapshots in the snapshot context are sorted by * snapshot id, highest id first. (Snapshots in the rbd_dev's list * are also maintained in that order.) + * + * Note that any error occurs while updating the snapshot list + * aborts the update, and the entire list is cleared. The snapshot + * list becomes inconsistent at that point anyway, so it might as + * well be empty. 
*/ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) { @@ -4018,8 +4022,9 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) struct list_head *head = &rbd_dev->snaps; struct list_head *links = head->next; u32 index = 0; + int ret = 0; - dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); + dout("%s: snap count is %u\n", __func__, (unsigned int)snap_count); while (index < snap_count || links != head) { u64 snap_id; struct rbd_snap *snap; @@ -4040,17 +4045,17 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) * A previously-existing snapshot is not in * the new snap context. * - * If the now missing snapshot is the one the - * image is mapped to, clear its exists flag - * so we can avoid sending any more requests - * to it. + * If the now-missing snapshot is the one + * the image represents, clear its existence + * flag so we can avoid sending any more + * requests to it. */ if (rbd_dev->spec->snap_id == snap->id) clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); dout("removing %ssnap id %llu\n", rbd_dev->spec->snap_id == snap->id ? 
"mapped " : "", - (unsigned long long) snap->id); + (unsigned long long)snap->id); rbd_remove_snap_dev(snap); /* Done with this list entry; advance */ @@ -4061,11 +4066,14 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) snap_name = rbd_dev_snap_info(rbd_dev, index, &snap_size, &snap_features); - if (IS_ERR(snap_name)) - return PTR_ERR(snap_name); + if (IS_ERR(snap_name)) { + ret = PTR_ERR(snap_name); + dout("failed to get snap info, error %d\n", ret); + goto out_err; + } - dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, - (unsigned long long) snap_id); + dout("entry %u: snap_id = %llu\n", (unsigned int)snap_count, + (unsigned long long)snap_id); if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { struct rbd_snap *new_snap; @@ -4074,11 +4082,9 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, snap_id, snap_size, snap_features); if (IS_ERR(new_snap)) { - int err = PTR_ERR(new_snap); - - dout(" failed to add dev, error %d\n", err); - - return err; + ret = PTR_ERR(new_snap); + dout(" failed to add dev, error %d\n", ret); + goto out_err; } /* New goes before existing, or at end of list */ @@ -4109,6 +4115,10 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) dout("%s: done\n", __func__); return 0; +out_err: + rbd_remove_all_snaps(rbd_dev); + + return ret; } static int rbd_bus_add_dev(struct rbd_device *rbd_dev) -- cgit v1.2.3 From c86f86e9e75e77e4d51ded9edbad30834ff606f7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Apr 2013 15:09:41 -0500 Subject: rbd: make snap_size order parameter optional Only one of the two callers of _rbd_dev_v2_snap_size() needs the order value returned. So make that an optional argument--a null pointer if the caller doesn't need it. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 28b652c38102..1e01f0d8312a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3494,7 +3494,8 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, if (ret < sizeof (size_buf)) return -ERANGE; - *order = size_buf.order; + if (order) + *order = size_buf.order; *snap_size = le64_to_cpu(size_buf.size); dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", @@ -3939,11 +3940,10 @@ static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, u64 *snap_size, u64 *snap_features) { u64 snap_id; - u8 order; int ret; snap_id = rbd_dev->header.snapc->snaps[which]; - ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size); + ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, snap_size); if (ret) return ERR_PTR(ret); ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); -- cgit v1.2.3 From acb1b6caf179d405ebd1dddefe916ccbb9b90298 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Apr 2013 15:09:41 -0500 Subject: rbd: only update values on snap_info success Change rbd_dev_v2_snap_info() so it only ever sets values of the size and features parameters if looking up the snapshot name was successful. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1e01f0d8312a..e7d10d384f07 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3908,6 +3908,7 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) if (!reply_buf) return ERR_PTR(-ENOMEM); + rbd_assert(which < rbd_dev->header.snapc->num_snaps); snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_snapshot_name", @@ -3940,17 +3941,30 @@ static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, u64 *snap_size, u64 *snap_features) { u64 snap_id; + u64 size; + u64 features; + char *snap_name; int ret; + rbd_assert(which < rbd_dev->header.snapc->num_snaps); snap_id = rbd_dev->header.snapc->snaps[which]; - ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, snap_size); + ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); if (ret) - return ERR_PTR(ret); - ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); + goto out_err; + + ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); if (ret) - return ERR_PTR(ret); + goto out_err; + + snap_name = rbd_dev_v2_snap_name(rbd_dev, which); + if (!IS_ERR(snap_name)) { + *snap_size = size; + *snap_features = features; + } - return rbd_dev_v2_snap_name(rbd_dev, which); + return snap_name; +out_err: + return ERR_PTR(ret); } static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, -- cgit v1.2.3 From 6087b51b9e7b311353408945bcc48368a54b8bbc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Apr 2013 15:09:41 -0500 Subject: rbd: rename __rbd_add_snap_dev() Rename __rbd_add_snap_dev() to be rbd_snap_create(). 
We no longer have devices for non-mapped snapshots, and we're not actually "adding" it to the list in this function, just creating it. Rename rbd_remove_snap_dev() to be rbd_snap_destroy() for reasons similar to the above. Stop having this function delete the snapshot from its list (to be symmetrical with its create counterpart) and do that in the caller instead. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e7d10d384f07..916741b09aaa 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -359,7 +359,7 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request); static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); static void rbd_dev_release(struct device *dev); -static void rbd_remove_snap_dev(struct rbd_snap *snap); +static void rbd_snap_destroy(struct rbd_snap *snap); static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); @@ -3010,8 +3010,10 @@ static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) struct rbd_snap *snap; struct rbd_snap *next; - list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) - rbd_remove_snap_dev(snap); + list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) { + list_del(&snap->node); + rbd_snap_destroy(snap); + } } static void rbd_update_mapping_size(struct rbd_device *rbd_dev) @@ -3413,14 +3415,13 @@ static void rbd_dev_destroy(struct rbd_device *rbd_dev) kfree(rbd_dev); } -static void rbd_remove_snap_dev(struct rbd_snap *snap) +static void rbd_snap_destroy(struct rbd_snap *snap) { - list_del(&snap->node); kfree(snap->name); kfree(snap); } -static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, +static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev, const char *snap_name, u64 snap_id, u64 snap_size, u64 snap_features) @@ -4070,7 +4071,9 @@ static int 
rbd_dev_snaps_update(struct rbd_device *rbd_dev) rbd_dev->spec->snap_id == snap->id ? "mapped " : "", (unsigned long long)snap->id); - rbd_remove_snap_dev(snap); + + list_del(&snap->node); + rbd_snap_destroy(snap); /* Done with this list entry; advance */ @@ -4093,7 +4096,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) /* We haven't seen this snapshot before */ - new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, + new_snap = rbd_snap_create(rbd_dev, snap_name, snap_id, snap_size, snap_features); if (IS_ERR(new_snap)) { ret = PTR_ERR(new_snap); -- cgit v1.2.3 From 6e584f5244060edc77141700d814a2af7d697685 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Apr 2013 15:09:42 -0500 Subject: rbd: fix leak of format 2 snapshot names When the snapshot context for an rbd device gets updated (or the initial one is recorded) a list of snapshot structures is created to represent them, one entry per snapshot. Each entry includes a dynamically-allocated copy of the snapshot name. Currently the name is allocated in rbd_snap_create(), as a duplicate of the passed-in name. For format 1 images, the snapshot name provided is just a pointer to an existing name. But for format 2 images, the passed-in name is already dynamically allocated, and in the process of duplicating it here we are leaking the passed-in name. Fix this by dynamically allocating the name for format 1 snapshots also, and then stop allocating a duplicate in rbd_snap_create(). Change rbd_dev_v1_snap_info() so none of its parameters is side-effected unless it's going to return success.
This is part of: http://tracker.ceph.com/issues/4803 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 916741b09aaa..c15bb3f5ebfb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3427,46 +3427,44 @@ static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev, u64 snap_features) { struct rbd_snap *snap; - int ret; snap = kzalloc(sizeof (*snap), GFP_KERNEL); if (!snap) return ERR_PTR(-ENOMEM); - ret = -ENOMEM; - snap->name = kstrdup(snap_name, GFP_KERNEL); - if (!snap->name) - goto err; - + snap->name = snap_name; snap->id = snap_id; snap->size = snap_size; snap->features = snap_features; return snap; - -err: - kfree(snap->name); - kfree(snap); - - return ERR_PTR(ret); } +/* + * Returns a dynamically-allocated snapshot name if successful, or a + * pointer-coded error otherwise. 
+ */ static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, u64 *snap_size, u64 *snap_features) { char *snap_name; + int i; rbd_assert(which < rbd_dev->header.snapc->num_snaps); - *snap_size = rbd_dev->header.snap_sizes[which]; - *snap_features = 0; /* No features for v1 */ - /* Skip over names until we find the one we are looking for */ snap_name = rbd_dev->header.snap_names; - while (which--) + for (i = 0; i < which; i++) snap_name += strlen(snap_name) + 1; + snap_name = kstrdup(snap_name, GFP_KERNEL); + if (!snap_name) + return ERR_PTR(-ENOMEM); + + *snap_size = rbd_dev->header.snap_sizes[which]; + *snap_features = 0; /* No features for v1 */ + return snap_name; } -- cgit v1.2.3 From f40eb349e032bee2b6f06e9b6f1dbfae561bd30a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Apr 2013 15:09:42 -0500 Subject: rbd: use rbd_obj_method_sync() return value Now that rbd_obj_method_sync() returns the number of bytes returned by the method call, that value should be used by callers to ensure we don't overrun the valid portion of the buffer. Fix the two spots that remained that weren't doing that, rbd_dev_image_name() and rbd_dev_v2_snap_name(). Rearrange the error path slightly in rbd_dev_v2_snap_name(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c15bb3f5ebfb..21e84a15ae4c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2614,7 +2614,8 @@ out_cancel: } /* - * Synchronous osd object method call + * Synchronous osd object method call. Returns the number of bytes + * returned in the outbound buffer, or a negative error code. 
*/ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, const char *object_name, @@ -3741,7 +3742,8 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) if (ret < 0) goto out; p = reply_buf; - end = reply_buf + size; + end = reply_buf + ret; + image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); if (IS_ERR(image_name)) image_name = NULL; @@ -3914,26 +3916,23 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) &snap_id, sizeof (snap_id), reply_buf, size, NULL); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); - if (ret < 0) + if (ret < 0) { + snap_name = ERR_PTR(ret); goto out; + } p = reply_buf; - end = reply_buf + size; + end = reply_buf + ret; snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); - if (IS_ERR(snap_name)) { - ret = PTR_ERR(snap_name); + if (IS_ERR(snap_name)) goto out; - } else { - dout(" snap_id 0x%016llx snap_name = %s\n", - (unsigned long long)le64_to_cpu(snap_id), snap_name); - } - kfree(reply_buf); - return snap_name; + dout(" snap_id 0x%016llx snap_name = %s\n", + (unsigned long long)le64_to_cpu(snap_id), snap_name); out: kfree(reply_buf); - return ERR_PTR(ret); + return snap_name; } static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, -- cgit v1.2.3 From a0cab924324fac8d6414009bc25ce31eeece038e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Apr 2013 23:15:08 -0500 Subject: rbd: avoid dropping extra reference in rbd_free_disk() I found during some failure injection testing that the call to rbd_free_disk() in the error path of rbd_dev_probe_finish() was dropping an extra reference to the disk queue. The problem occurred when put_disk tried to drop a reference to the disk's queue. A call to blk_cleanup_queue() just prior to that will have also dropped a reference to the queue. The problem is that the reference dropped by put_disk() is assumed to have been taken by add_disk(). 
Our code has error paths that can occur after the disk and its queue are initialized, but before the call to add_disk(), and in those paths we won't have that extra reference. The fix is easy though. In rbd_free_disk() we're already checking the disk's GENHD_FL_UP flag. That flag is an indication that add_disk() has been called, so just call blk_cleanup_queue() conditional on that flag being set. This resolves: http://tracker.ceph.com/issues/4800 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 21e84a15ae4c..1704a3b1e4cb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2844,10 +2844,12 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) if (!disk) return; - if (disk->flags & GENHD_FL_UP) + rbd_dev->disk = NULL; + if (disk->flags & GENHD_FL_UP) { del_gendisk(disk); - if (disk->queue) - blk_cleanup_queue(disk->queue); + if (disk->queue) + blk_cleanup_queue(disk->queue); + } put_disk(disk); } -- cgit v1.2.3 From c0fba36880288afbeca872298c970fb4abb76464 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Apr 2013 23:15:08 -0500 Subject: rbd: have rbd_dev_image_id() set format 1 image id Currently, rbd_dev_probe() assumes that any error returned by rbd_dev_image_id() is most likely -ENOENT, and responds by calling the format 1 probe routine, rbd_dev_v1_probe(). Then, at the top of rbd_dev_v1_probe(), an empty string is allocated for the image id. This is sort of unbalanced. Fix this by having rbd_dev_image_id() look for -ENOENT from its "get_id" method call. If that is seen, have it allocate the empty string there rather than depending on rbd_dev_v1_probe() to do it. Given that this is effectively defining the format of the image, set rbd_dev->image_format inside rbd_dev_image_id() rather than in the format-specific probe routines. 
Also drop a redundant hunk of code in rbd_dev_image_id(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 62 +++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 30 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1704a3b1e4cb..0ddcbe584a1f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4477,20 +4477,19 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) size_t size; char *object_name; void *response; - void *p; - - /* If we already have it we don't need to look it up */ - - if (rbd_dev->spec->image_id) - return 0; + char *image_id; /* * When probing a parent image, the image id is already * known (and the image name likely is not). There's no - * need to fetch the image id again in this case. + * need to fetch the image id again in this case. We + * do still need to set the image format though. */ - if (rbd_dev->spec->image_id) + if (rbd_dev->spec->image_id) { + rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; + return 0; + } /* * First, see if the format 2 image id file exists, and if @@ -4512,24 +4511,32 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) goto out; } + /* If it doesn't exist we'll assume it's a format 1 image */ + ret = rbd_obj_method_sync(rbd_dev, object_name, "rbd", "get_id", NULL, 0, response, RBD_IMAGE_ID_LEN_MAX, NULL); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); - if (ret < 0) - goto out; - - p = response; - rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, - p + ret, + if (ret == -ENOENT) { + image_id = kstrdup("", GFP_KERNEL); + ret = image_id ? 
0 : -ENOMEM; + if (!ret) + rbd_dev->image_format = 1; + } else if (ret > sizeof (__le32)) { + void *p = response; + + image_id = ceph_extract_encoded_string(&p, p + ret, NULL, GFP_NOIO); - ret = 0; - - if (IS_ERR(rbd_dev->spec->image_id)) { - ret = PTR_ERR(rbd_dev->spec->image_id); - rbd_dev->spec->image_id = NULL; + ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0; + if (!ret) + rbd_dev->image_format = 2; } else { - dout("image_id is %s\n", rbd_dev->spec->image_id); + ret = -EINVAL; + } + + if (!ret) { + rbd_dev->spec->image_id = image_id; + dout("image_id is %s\n", image_id); } out: kfree(response); @@ -4543,12 +4550,6 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) int ret; size_t size; - /* Version 1 images have no id; empty string is used */ - - rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); - if (!rbd_dev->spec->image_id) - return -ENOMEM; - /* Record the header object name for this rbd image. */ size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); @@ -4571,8 +4572,6 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) rbd_dev->parent_spec = NULL; rbd_dev->parent_overlap = 0; - rbd_dev->image_format = 1; - dout("discovered version 1 image, header name is %s\n", rbd_dev->header_name); @@ -4651,8 +4650,6 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) goto out_err; rbd_dev->header.obj_version = ver; - rbd_dev->image_format = 2; - dout("discovered version 2 image, header name is %s\n", rbd_dev->header_name); @@ -4795,6 +4792,11 @@ static int rbd_dev_probe(struct rbd_device *rbd_dev) */ ret = rbd_dev_image_id(rbd_dev); if (ret) + return ret; + rbd_assert(rbd_dev->spec->image_id); + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + + if (rbd_dev->image_format == 1) ret = rbd_dev_v1_probe(rbd_dev); else ret = rbd_dev_v2_probe(rbd_dev); -- cgit v1.2.3 From 5655c4d940ba8dd32250ab1e4ba3db785943a28e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Apr 2013 23:15:08 -0500 Subject: rbd: fix image id leak in 
initial probe If a format 2 image id is found for an image being mapped, but the subsequent probe of the image fails, rbd_dev_probe() quits without freeing the image id. Fix that. Also drop a redundant hunk of code in rbd_dev_image_id(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0ddcbe584a1f..815c174661a8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4800,16 +4800,20 @@ static int rbd_dev_probe(struct rbd_device *rbd_dev) ret = rbd_dev_v1_probe(rbd_dev); else ret = rbd_dev_v2_probe(rbd_dev); - if (ret) { - dout("probe failed, returning %d\n", ret); - - return ret; - } + if (ret) + goto out_err; ret = rbd_dev_probe_finish(rbd_dev); if (ret) rbd_header_free(&rbd_dev->header); + return ret; +out_err: + kfree(rbd_dev->spec->image_id); + rbd_dev->spec->image_id = NULL; + + dout("probe failed, returning %d\n", ret); + return ret; } -- cgit v1.2.3 From 8b0241f85ab11c87075f9de0191acd8b546c6f6a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Apr 2013 23:15:08 -0500 Subject: rbd: have snap_by_name() return a snapshot A function called snap_by_name() ought to just look up a snapshot by name. It does that, but then it assigns some stuff to the rbd device structure as well. Change the function to do just the lookup, and have the caller do the assignments that follow. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 815c174661a8..6b1e9a9f2f72 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -830,44 +830,39 @@ static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) return NULL; } -static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) +static struct rbd_snap *snap_by_name(struct rbd_device *rbd_dev, + const char *snap_name) { - struct rbd_snap *snap; - list_for_each_entry(snap, &rbd_dev->snaps, node) { - if (!strcmp(snap_name, snap->name)) { - rbd_dev->spec->snap_id = snap->id; - rbd_dev->mapping.size = snap->size; - rbd_dev->mapping.features = snap->features; - - return 0; - } - } + list_for_each_entry(snap, &rbd_dev->snaps, node) + if (!strcmp(snap_name, snap->name)) + return snap; - return -ENOENT; + return NULL; } static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) { - int ret; - if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { rbd_dev->spec->snap_id = CEPH_NOSNAP; rbd_dev->mapping.size = rbd_dev->header.image_size; rbd_dev->mapping.features = rbd_dev->header.features; - ret = 0; } else { - ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); - if (ret < 0) - goto done; + struct rbd_snap *snap; + + snap = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); + if (!snap) + return -ENOENT; + rbd_dev->spec->snap_id = snap->id; + rbd_dev->mapping.size = snap->size; + rbd_dev->mapping.features = snap->features; rbd_dev->mapping.read_only = true; } set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); -done: - return ret; + return 0; } static void rbd_header_free(struct rbd_image_header *header) -- cgit v1.2.3 From e1d4213f090644b06aab6ea70e307ecf16182148 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 25 Apr 2013 23:15:08 -0500 Subject: 
rbd: set snapshot id in rbd_dev_probe_update_spec() Set the rbd spec's snapshot id for an image getting mapped in rbd_dev_probe_update_spec() rather than rbd_dev_set_mapping(). This is the more logical place for that to happen (even though it means we might look up the snapshot by name twice). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6b1e9a9f2f72..c34f8716d1d2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -846,7 +846,6 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) { if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { - rbd_dev->spec->snap_id = CEPH_NOSNAP; rbd_dev->mapping.size = rbd_dev->header.image_size; rbd_dev->mapping.features = rbd_dev->header.features; } else { @@ -855,7 +854,6 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) snap = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); if (!snap) return -ENOENT; - rbd_dev->spec->snap_id = snap->id; rbd_dev->mapping.size = snap->size; rbd_dev->mapping.features = snap->features; rbd_dev->mapping.read_only = true; @@ -3760,6 +3758,10 @@ out: * rbd_dev_snaps_update() has completed because some of the * information (in particular, snapshot name) is not available * until then. + * + * When an image being mapped (not a parent) is probed, we have the + * pool name and pool id, image name and image id, and the snapshot + * name. The only thing we're missing is the snapshot id. */ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) { @@ -3768,8 +3770,24 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) void *reply_buf = NULL; int ret; - if (rbd_dev->spec->pool_name) - return 0; /* Already have the names */ + /* + * An image being mapped will have the pool name (etc.), but + * we need to look up the snapshot id. 
+ */ + if (rbd_dev->spec->pool_name) { + if (strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME)) { + struct rbd_snap *snap; + + snap = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); + if (!snap) + return -ENOENT; + rbd_dev->spec->snap_id = snap->id; + } else { + rbd_dev->spec->snap_id = CEPH_NOSNAP; + } + + return 0; + } /* Look up the pool name */ -- cgit v1.2.3 From ecb4dc225612e1c0b28d2c1b168422dde4f442a6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 09:43:47 -0500 Subject: rbd: make rbd spec names pointer to const Make the names and image id in an rbd_spec be pointers to constant data. This required the use of a local variable to hold the snapshot name in rbd_add_parse_args() to avoid a warning. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c34f8716d1d2..e728e11096b4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -138,13 +138,13 @@ struct rbd_image_header { */ struct rbd_spec { u64 pool_id; - char *pool_name; + const char *pool_name; - char *image_id; - char *image_name; + const char *image_id; + const char *image_name; u64 snap_id; - char *snap_name; + const char *snap_name; struct kref kref; }; @@ -4375,6 +4375,7 @@ static int rbd_add_parse_args(const char *buf, size_t len; char *options; const char *mon_addrs; + char *snap_name; size_t mon_addrs_size; struct rbd_spec *spec = NULL; struct rbd_options *rbd_opts = NULL; @@ -4433,10 +4434,11 @@ static int rbd_add_parse_args(const char *buf, ret = -ENAMETOOLONG; goto out_err; } - spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); - if (!spec->snap_name) + snap_name = kmemdup(buf, len + 1, GFP_KERNEL); + if (!snap_name) goto out_mem; - *(spec->snap_name + len) = '\0'; + *(snap_name + len) = '\0'; + spec->snap_name = snap_name; /* Initialize all rbd options to the defaults */ -- cgit v1.2.3 From 
500d0c0fbb85b59e5e75fc83ff701b7d8aa285f9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 09:43:47 -0500 Subject: rbd: move stripe_unit and stripe_count into header This commit added fetching of fancy striping parameters: 09186ddb rbd: get and check striping parameters They are almost unused, but the two fields storing the information really belonged in the rbd_image_header structure. This patch moves them there. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e728e11096b4..8e56fbd1fcf7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -108,6 +108,9 @@ struct rbd_image_header { char *snap_names; u64 *snap_sizes; + u64 stripe_unit; + u64 stripe_count; + u64 obj_version; }; @@ -316,9 +319,6 @@ struct rbd_device { u64 parent_overlap; struct rbd_device *parent; - u64 stripe_unit; - u64 stripe_count; - /* protects updating the header */ struct rw_semaphore header_rwsem; @@ -3695,8 +3695,8 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) "(got %llu want 1)", stripe_count); return -EINVAL; } - rbd_dev->stripe_unit = stripe_unit; - rbd_dev->stripe_count = stripe_count; + rbd_dev->header.stripe_unit = stripe_unit; + rbd_dev->header.stripe_count = stripe_count; return 0; } -- cgit v1.2.3 From c0cd10db4685a76397f32bed246e861705642576 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 09:43:47 -0500 Subject: rbd: use rbd_warn(), not WARN_ON() Change some calls to WARN_ON() so they use rbd_warn() instead, so we get consistent messaging. A few remain but they can probably just go away eventually.
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8e56fbd1fcf7..2e2e9c35b4e5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -777,7 +777,6 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->snap_sizes[i] = le64_to_cpu(ondisk->snaps[i].image_size); } else { - WARN_ON(ondisk->snap_names_len); header->snap_names = NULL; header->snap_sizes = NULL; } @@ -2755,8 +2754,11 @@ static void rbd_request_fn(struct request_queue *q) } result = -EINVAL; - if (WARN_ON(offset && length > U64_MAX - offset + 1)) + if (offset && length > U64_MAX - offset + 1) { + rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n", + offset, length); goto end_request; /* Shouldn't happen */ + } result = -ENOMEM; img_request = rbd_img_request_create(rbd_dev, offset, length, @@ -2955,7 +2957,7 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) 0, size, ondisk, version); if (ret < 0) goto out_err; - if (WARN_ON((size_t) ret < size)) { + if ((size_t)ret < size) { ret = -ENXIO; rbd_warn(rbd_dev, "short header read (want %zd got %d)", size, ret); @@ -3057,7 +3059,8 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) rbd_dev->header.snap_names = h.snap_names; rbd_dev->header.snap_sizes = h.snap_sizes; /* Free the extra copy of the object prefix */ - WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); + if (strcmp(rbd_dev->header.object_prefix, h.object_prefix)) + rbd_warn(rbd_dev, "object prefix changed (ignoring)"); kfree(h.object_prefix); ret = rbd_dev_snaps_update(rbd_dev); @@ -3627,8 +3630,11 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) /* The ceph file layout needs to fit pool id in 32 bits */ ret = -EIO; - if (WARN_ON(parent_spec->pool_id > (u64)U32_MAX)) + if (parent_spec->pool_id > (u64)U32_MAX) { + 
rbd_warn(NULL, "parent pool id too large (%llu > %u)\n", + (unsigned long long)parent_spec->pool_id, U32_MAX); goto out_err; + } image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { @@ -4864,11 +4870,13 @@ static ssize_t rbd_add(struct bus_type *bus, rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); if (rc < 0) goto err_out_client; - spec->pool_id = (u64) rc; + spec->pool_id = (u64)rc; /* The ceph file layout needs to fit pool id in 32 bits */ - if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { + if (spec->pool_id > (u64)U32_MAX) { + rbd_warn(NULL, "pool id too large (%llu > %u)\n", + (unsigned long long)spec->pool_id, U32_MAX); rc = -EIO; goto err_out_client; } @@ -4902,7 +4910,7 @@ err_out_module: dout("Error adding device %s\n", buf); - return (ssize_t) rc; + return (ssize_t)rc; } static struct rbd_device *__rbd_get_dev(unsigned long dev_id) -- cgit v1.2.3 From 468521c1b1450d8e9bda22df9455deaa4feed00f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 09:43:47 -0500 Subject: rbd: define rbd snap context routines Encapsulate the creation of a snapshot context for rbd in a new function rbd_snap_context_create(). Define rbd wrappers for getting and dropping references to them once they're created. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 65 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 26 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2e2e9c35b4e5..b6775ae1a770 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -671,6 +671,35 @@ static void rbd_client_release(struct kref *kref) kfree(rbdc); } +/* Caller has to fill in snapc->seq and snapc->snaps[0..snap_count-1] */ + +static struct ceph_snap_context *rbd_snap_context_create(u32 snap_count) +{ + struct ceph_snap_context *snapc; + size_t size; + + size = sizeof (struct ceph_snap_context); + size += snap_count * sizeof (snapc->snaps[0]); + snapc = kzalloc(size, GFP_KERNEL); + if (!snapc) + return NULL; + + atomic_set(&snapc->nref, 1); + snapc->num_snaps = snap_count; + + return snapc; +} + +static inline void rbd_snap_context_get(struct ceph_snap_context *snapc) +{ + (void)ceph_get_snap_context(snapc); +} + +static inline void rbd_snap_context_put(struct ceph_snap_context *snapc) +{ + ceph_put_snap_context(snapc); +} + /* * Drop reference to ceph client node. If it's not referenced anymore, release * it. 
@@ -789,18 +818,13 @@ static int rbd_header_from_disk(struct rbd_image_header *header, /* Allocate and fill in the snapshot context */ header->image_size = le64_to_cpu(ondisk->image_size); - size = sizeof (struct ceph_snap_context); - size += snap_count * sizeof (header->snapc->snaps[0]); - header->snapc = kzalloc(size, GFP_KERNEL); + + header->snapc = rbd_snap_context_create(snap_count); if (!header->snapc) goto out_err; - - atomic_set(&header->snapc->nref, 1); header->snapc->seq = le64_to_cpu(ondisk->snap_seq); - header->snapc->num_snaps = snap_count; for (i = 0; i < snap_count; i++) - header->snapc->snaps[i] = - le64_to_cpu(ondisk->snaps[i].id); + header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id); return 0; @@ -870,7 +894,7 @@ static void rbd_header_free(struct rbd_image_header *header) header->snap_sizes = NULL; kfree(header->snap_names); header->snap_names = NULL; - ceph_put_snap_context(header->snapc); + rbd_snap_context_put(header->snapc); header->snapc = NULL; } @@ -1720,7 +1744,6 @@ static struct rbd_img_request *rbd_img_request_create( bool child_request) { struct rbd_img_request *img_request; - struct ceph_snap_context *snapc = NULL; img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); if (!img_request) @@ -1728,13 +1751,8 @@ static struct rbd_img_request *rbd_img_request_create( if (write_request) { down_read(&rbd_dev->header_rwsem); - snapc = ceph_get_snap_context(rbd_dev->header.snapc); + rbd_snap_context_get(rbd_dev->header.snapc); up_read(&rbd_dev->header_rwsem); - if (WARN_ON(!snapc)) { - kfree(img_request); - return NULL; /* Shouldn't happen */ - } - } img_request->rq = NULL; @@ -1744,7 +1762,7 @@ static struct rbd_img_request *rbd_img_request_create( img_request->flags = 0; if (write_request) { img_request_write_set(img_request); - img_request->snapc = snapc; + img_request->snapc = rbd_dev->header.snapc; } else { img_request->snap_id = rbd_dev->spec->snap_id; } @@ -1785,7 +1803,7 @@ static void rbd_img_request_destroy(struct kref 
*kref) rbd_assert(img_request->obj_request_count == 0); if (img_request_write_test(img_request)) - ceph_put_snap_context(img_request->snapc); + rbd_snap_context_put(img_request->snapc); if (img_request_child_test(img_request)) rbd_obj_request_put(img_request->obj_request); @@ -3049,7 +3067,7 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) kfree(rbd_dev->header.snap_sizes); kfree(rbd_dev->header.snap_names); /* osd requests may still refer to snapc */ - ceph_put_snap_context(rbd_dev->header.snapc); + rbd_snap_context_put(rbd_dev->header.snapc); if (hver) *hver = h.obj_version; @@ -3889,19 +3907,14 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) } if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) goto out; + ret = 0; - size = sizeof (struct ceph_snap_context) + - snap_count * sizeof (snapc->snaps[0]); - snapc = kmalloc(size, GFP_KERNEL); + snapc = rbd_snap_context_create(snap_count); if (!snapc) { ret = -ENOMEM; goto out; } - ret = 0; - - atomic_set(&snapc->nref, 1); snapc->seq = seq; - snapc->num_snaps = snap_count; for (i = 0; i < snap_count; i++) snapc->snaps[i] = ceph_decode_64(&p); -- cgit v1.2.3 From 9f5dffdc8f5dbc16493566b6aac59f275d5cb3f9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 09:43:47 -0500 Subject: rbd: make rbd_dev_destroy() match rbd_dev_create() Currently, rbd_dev_destroy() does more than just the inverse of what rbd_dev_create() does. Stop doing that, and move the two extra things it does into the three call sites. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b6775ae1a770..e6dab9f7dd75 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3425,8 +3425,6 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, static void rbd_dev_destroy(struct rbd_device *rbd_dev) { - rbd_spec_put(rbd_dev->parent_spec); - kfree(rbd_dev->header_name); rbd_put_client(rbd_dev->rbd_client); rbd_spec_put(rbd_dev->spec); kfree(rbd_dev); @@ -4788,6 +4786,8 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) return ret; err_out_parent: + rbd_spec_put(rbd_dev->parent_spec); + kfree(rbd_dev->header_name); rbd_dev_destroy(parent); err_out_spec: rbd_spec_put(parent_spec); @@ -4910,6 +4910,8 @@ static ssize_t rbd_add(struct bus_type *bus, return count; err_out_rbd_dev: + rbd_spec_put(rbd_dev->parent_spec); + kfree(rbd_dev->header_name); rbd_dev_destroy(rbd_dev); err_out_client: rbd_put_client(rbdc); @@ -4960,6 +4962,8 @@ static void rbd_dev_release(struct device *dev) /* done with the id, and with the rbd_dev */ rbd_dev_id_put(rbd_dev); rbd_assert(rbd_dev->rbd_client != NULL); + rbd_spec_put(rbd_dev->parent_spec); + kfree(rbd_dev->header_name); rbd_dev_destroy(rbd_dev); /* release module ref */ -- cgit v1.2.3 From 71f293e26e760c4151e00b8f611e67da222f89c7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 09:43:48 -0500 Subject: rbd: rename rbd_dev_probe() Rename rbd_dev_probe() to be rbd_dev_image_probe(). Its purpose will eventually be to probe for the existence of a valid rbd image for the rbd device--focusing only on the ceph side and not the Linux device side of initialization. For now the two "sides" are not fully separated, and this function is still the entry point for initializing the full rbd device. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e6dab9f7dd75..09062c48705b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -365,7 +365,7 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); -static int rbd_dev_probe(struct rbd_device *rbd_dev); +static int rbd_dev_image_probe(struct rbd_device *rbd_dev); static struct bus_attribute rbd_bus_attrs[] = { __ATTR(add, S_IWUSR, NULL, rbd_add), @@ -4766,7 +4766,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) } rbdc = NULL; /* parent now owns reference */ parent_spec = NULL; /* parent now owns reference */ - ret = rbd_dev_probe(parent); + ret = rbd_dev_image_probe(parent); if (ret < 0) goto err_out_parent; rbd_dev->parent = parent; @@ -4815,7 +4815,7 @@ err_out_snaps: * device. For format 2 images this includes determining the image * id. */ -static int rbd_dev_probe(struct rbd_device *rbd_dev) +static int rbd_dev_image_probe(struct rbd_device *rbd_dev) { int ret; @@ -4904,7 +4904,7 @@ static ssize_t rbd_add(struct bus_type *bus, kfree(rbd_opts); rbd_opts = NULL; /* done with this */ - rc = rbd_dev_probe(rbd_dev); + rc = rbd_dev_image_probe(rbd_dev); if (rc < 0) goto err_out_rbd_dev; -- cgit v1.2.3 From 2e9f7f1c0de23156e225046f10fad939a4017e97 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 09:43:48 -0500 Subject: rbd: refactor rbd_dev_probe_update_spec() Fairly straightforward refactoring of rbd_dev_probe_update_spec(). The name is changed to rbd_dev_spec_update(). Rearrange it so nothing gets assigned to the spec until all of the names have been successfully acquired. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 83 ++++++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 39 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 09062c48705b..3bd12ead5091 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3774,83 +3774,88 @@ out: } /* - * When a parent image gets probed, we only have the pool, image, - * and snapshot ids but not the names of any of them. This call - * is made later to fill in those names. It has to be done after - * rbd_dev_snaps_update() has completed because some of the - * information (in particular, snapshot name) is not available - * until then. + * When an rbd image has a parent image, it is identified by the + * pool, image, and snapshot ids (not names). This function fills + * in the names for those ids. (It's OK if we can't figure out the + * name for an image id, but the pool and snapshot ids should always + * exist and have names.) All names in an rbd spec are dynamically + * allocated. * * When an image being mapped (not a parent) is probed, we have the * pool name and pool id, image name and image id, and the snapshot * name. The only thing we're missing is the snapshot id. + * + * The set of snapshots for an image is not known until they have + * been read by rbd_dev_snaps_update(), so we can't completely fill + * in this information until after that has been called. */ -static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) +static int rbd_dev_spec_update(struct rbd_device *rbd_dev) { - struct ceph_osd_client *osdc; - const char *name; - void *reply_buf = NULL; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_spec *spec = rbd_dev->spec; + const char *pool_name; + const char *image_name; + const char *snap_name; int ret; /* * An image being mapped will have the pool name (etc.), but * we need to look up the snapshot id. 
*/ - if (rbd_dev->spec->pool_name) { - if (strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME)) { + if (spec->pool_name) { + if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { struct rbd_snap *snap; - snap = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); + snap = snap_by_name(rbd_dev, spec->snap_name); if (!snap) return -ENOENT; - rbd_dev->spec->snap_id = snap->id; + spec->snap_id = snap->id; } else { - rbd_dev->spec->snap_id = CEPH_NOSNAP; + spec->snap_id = CEPH_NOSNAP; } return 0; } - /* Look up the pool name */ + /* Get the pool name; we have to make our own copy of this */ - osdc = &rbd_dev->rbd_client->client->osdc; - name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); - if (!name) { - rbd_warn(rbd_dev, "there is no pool with id %llu", - rbd_dev->spec->pool_id); /* Really a BUG() */ + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); + if (!pool_name) { + rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); return -EIO; } - - rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); - if (!rbd_dev->spec->pool_name) + pool_name = kstrdup(pool_name, GFP_KERNEL); + if (!pool_name) return -ENOMEM; /* Fetch the image name; tolerate failure here */ - name = rbd_dev_image_name(rbd_dev); - if (name) - rbd_dev->spec->image_name = (char *)name; - else + image_name = rbd_dev_image_name(rbd_dev); + if (!image_name) rbd_warn(rbd_dev, "unable to get image name"); - /* Look up the snapshot name. 
*/ + /* Look up the snapshot name, and make a copy */ - name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); - if (!name) { - rbd_warn(rbd_dev, "no snapshot with id %llu", - rbd_dev->spec->snap_id); /* Really a BUG() */ + snap_name = rbd_snap_name(rbd_dev, spec->snap_id); + if (!snap_name) { + rbd_warn(rbd_dev, "no snapshot with id %llu", spec->snap_id); ret = -EIO; goto out_err; } - rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL); - if(!rbd_dev->spec->snap_name) + snap_name = kstrdup(snap_name, GFP_KERNEL); + if (!snap_name) { + ret = -ENOMEM; goto out_err; + } + + spec->pool_name = pool_name; + spec->image_name = image_name; + spec->snap_name = snap_name; return 0; out_err: - kfree(reply_buf); - kfree(rbd_dev->spec->pool_name); - rbd_dev->spec->pool_name = NULL; + kfree(image_name); + kfree(pool_name); return ret; } @@ -4710,7 +4715,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) return ret; - ret = rbd_dev_probe_update_spec(rbd_dev); + ret = rbd_dev_spec_update(rbd_dev); if (ret) goto err_out_snaps; -- cgit v1.2.3 From e28626a08b3e7412158551a639dd36887e2d728d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 15:44:35 -0500 Subject: rbd: fix a bug in resizing a mapping When a snapshot context update occurs, rbd_update_mapping_size() is called to set the capacity of the disk to record the updated size of the image in case it has changed. There's a bug though. The mapping size is in units of *bytes*. The code that updates the mapping size field is assigning a value that has been scaled down to *sectors*. Fix that. Also, check to see if the size has actually changed, and don't bother updating things (specifically, calling set_capacity()) if it has not. 
This resolves: http://tracker.ceph.com/issues/4833 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3bd12ead5091..83265adab19c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3034,15 +3034,17 @@ static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) static void rbd_update_mapping_size(struct rbd_device *rbd_dev) { - sector_t size; - if (rbd_dev->spec->snap_id != CEPH_NOSNAP) return; - size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE; - dout("setting size to %llu sectors", (unsigned long long) size); - rbd_dev->mapping.size = (u64) size; - set_capacity(rbd_dev->disk, size); + if (rbd_dev->mapping.size != rbd_dev->header.image_size) { + sector_t size; + + rbd_dev->mapping.size = rbd_dev->header.image_size; + size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; + dout("setting size to %llu sectors", (unsigned long long)size); + set_capacity(rbd_dev->disk, size); + } } /* -- cgit v1.2.3 From fc71d8330e39ef3af816a9c869150250952cb712 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 15:44:36 -0500 Subject: rbd: fix up some sysfs stuff This just tweaks a few things in the routines that implement rbd sysfs files. All of the entries for an rbd device in /sys/bus/rbd/devices/<id>/ will represent information whose valid values are known by the time they are accessible. Right now we get the size of the mapped image by a call to get_capacity(). There's no need to do this, because that will return what we last set the capacity to, which is just the size recorded for the mapping. So just show that value instead. We also get this under protection of the header semaphore, in order to provide a precisely correct value. This isn't really necessary; these files are really informational only and it's not necessary to be so careful.
Finally, print a special value in case the major device number is not recorded. Right now that won't matter much but soon the parent images won't have devices associated with them. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 83265adab19c..65d021be6c9e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3170,13 +3170,9 @@ static ssize_t rbd_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - sector_t size; - down_read(&rbd_dev->header_rwsem); - size = get_capacity(rbd_dev->disk); - up_read(&rbd_dev->header_rwsem); - - return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); + return sprintf(buf, "%llu\n", + (unsigned long long)rbd_dev->mapping.size); } /* @@ -3189,7 +3185,7 @@ static ssize_t rbd_features_show(struct device *dev, struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "0x%016llx\n", - (unsigned long long) rbd_dev->mapping.features); + (unsigned long long)rbd_dev->mapping.features); } static ssize_t rbd_major_show(struct device *dev, @@ -3197,7 +3193,11 @@ static ssize_t rbd_major_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%d\n", rbd_dev->major); + if (rbd_dev->major) + return sprintf(buf, "%d\n", rbd_dev->major); + + return sprintf(buf, "(none)\n"); + } static ssize_t rbd_client_id_show(struct device *dev, @@ -3223,7 +3223,7 @@ static ssize_t rbd_pool_id_show(struct device *dev, struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); return sprintf(buf, "%llu\n", - (unsigned long long) rbd_dev->spec->pool_id); + (unsigned long long) rbd_dev->spec->pool_id); } static ssize_t rbd_name_show(struct device *dev, -- cgit v1.2.3 From 129b79d4498581e52175ac5c3ef2168f616b0e5e Mon Sep 17 00:00:00 2001 From: Alex 
Elder Date: Fri, 26 Apr 2013 15:44:36 -0500 Subject: rbd: only set device exists flag when ready Hold off setting the EXISTS rbd device flag until just before we announce the disk as available for use. There's no point in doing so any earlier than that, and at that point the device truly is fully set up and ready to use. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 65d021be6c9e..f84a11ed25a4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -881,7 +881,6 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) rbd_dev->mapping.features = snap->features; rbd_dev->mapping.read_only = true; } - set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); return 0; } @@ -4785,6 +4784,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) /* Everything's ready. Announce the disk to the world. */ + set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); add_disk(rbd_dev->disk); pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, -- cgit v1.2.3 From b5156e76da01c23e14e962594553f1735b1db298 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 15:44:36 -0500 Subject: rbd: defer setting disk capacity Don't set the disk capacity until right before we announce the device as available for use. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f84a11ed25a4..b6024a2d7b86 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3147,8 +3147,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->disk = disk; - set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); - return 0; out_disk: put_disk(disk); @@ -4784,6 +4782,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) /* Everything's ready. Announce the disk to the world. */ + set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); add_disk(rbd_dev->disk); -- cgit v1.2.3 From 124afba25d58e2b52d7d4bad993065572a28d57f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 15:44:36 -0500 Subject: rbd: encapsulate probing for parent devices Encapsulate the code that probes for an rbd device's parent images into a new function, rbd_dev_probe_parent(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 82 ++++++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 38 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b6024a2d7b86..c80fc1a3a604 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4702,11 +4702,49 @@ out_err: return ret; } -static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) +static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) { struct rbd_device *parent = NULL; - struct rbd_spec *parent_spec = NULL; - struct rbd_client *rbdc = NULL; + struct rbd_spec *parent_spec; + struct rbd_client *rbdc; + int ret; + + if (!rbd_dev->parent_spec) + return 0; + /* + * We need to pass a reference to the client and the parent + * spec when creating the parent rbd_dev. 
Images related by + * parent/child relationships always share both. + */ + parent_spec = rbd_spec_get(rbd_dev->parent_spec); + rbdc = __rbd_get_client(rbd_dev->rbd_client); + + ret = -ENOMEM; + parent = rbd_dev_create(rbdc, parent_spec); + if (!parent) + goto out_err; + + ret = rbd_dev_image_probe(parent); + if (ret < 0) + goto out_err; + rbd_dev->parent = parent; + + return 0; +out_err: + if (parent) { + rbd_spec_put(rbd_dev->parent_spec); + kfree(rbd_dev->header_name); + rbd_dev_destroy(parent); + } else { + rbd_put_client(rbdc); + rbd_spec_put(parent_spec); + } + + return ret; +} + +static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) +{ int ret; /* no need to lock here, as rbd_dev is not registered yet */ @@ -4747,34 +4785,9 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) goto err_out_disk; - /* - * At this point cleanup in the event of an error is the job - * of the sysfs code (initiated by rbd_bus_del_dev()). - */ - /* Probe the parent if there is one */ - - if (rbd_dev->parent_spec) { - /* - * We need to pass a reference to the client and the - * parent spec when creating the parent rbd_dev. - * Images related by parent/child relationships - * always share both. 
- */ - parent_spec = rbd_spec_get(rbd_dev->parent_spec); - rbdc = __rbd_get_client(rbd_dev->rbd_client); - - parent = rbd_dev_create(rbdc, parent_spec); - if (!parent) { - ret = -ENOMEM; - goto err_out_spec; - } - rbdc = NULL; /* parent now owns reference */ - parent_spec = NULL; /* parent now owns reference */ - ret = rbd_dev_image_probe(parent); - if (ret < 0) - goto err_out_parent; - rbd_dev->parent = parent; - } + ret = rbd_dev_probe_parent(rbd_dev); + if (ret) + goto err_out_bus; ret = rbd_dev_header_watch_sync(rbd_dev, 1); if (ret) @@ -4791,13 +4804,6 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) return ret; -err_out_parent: - rbd_spec_put(rbd_dev->parent_spec); - kfree(rbd_dev->header_name); - rbd_dev_destroy(parent); -err_out_spec: - rbd_spec_put(parent_spec); - rbd_put_client(rbdc); err_out_bus: /* this will also clean up rest of rbd_dev stuff */ -- cgit v1.2.3 From 05a46afdc7f0f73d42dcecd8ee80f9558b4c38f7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 15:44:36 -0500 Subject: rbd: encapsulate removing parent devices Encapsulate the code that removes an rbd device's parent images into a new function, rbd_dev_remove_parent(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c80fc1a3a604..87ef01189b83 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -427,8 +427,9 @@ void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 
# define rbd_assert(expr) ((void) 0) #endif /* !RBD_DEBUG */ -static void rbd_img_parent_read(struct rbd_obj_request *obj_request); static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); +static void rbd_img_parent_read(struct rbd_obj_request *obj_request); +static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); @@ -4988,6 +4989,29 @@ static void __rbd_remove(struct rbd_device *rbd_dev) rbd_bus_del_dev(rbd_dev); } +static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) +{ + while (rbd_dev->parent_spec) { + struct rbd_device *first = rbd_dev; + struct rbd_device *second = first->parent; + struct rbd_device *third; + + /* + * Follow to the parent with no grandparent and + * remove it. + */ + while (second && (third = second->parent)) { + first = second; + second = third; + } + __rbd_remove(second); + rbd_spec_put(first->parent_spec); + first->parent_spec = NULL; + first->parent_overlap = 0; + first->parent = NULL; + } +} + static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count) @@ -5023,25 +5047,8 @@ static ssize_t rbd_remove(struct bus_type *bus, if (ret < 0) goto done; - while (rbd_dev->parent_spec) { - struct rbd_device *first = rbd_dev; - struct rbd_device *second = first->parent; - struct rbd_device *third; + rbd_dev_remove_parent(rbd_dev); - /* - * Follow to the parent with no grandparent and - * remove it. 
- */ - while (second && (third = second->parent)) { - first = second; - second = third; - } - __rbd_remove(second); - rbd_spec_put(first->parent_spec); - first->parent_spec = NULL; - first->parent_overlap = 0; - first->parent = NULL; - } __rbd_remove(rbd_dev); done: -- cgit v1.2.3 From d1cf5788450e1781f63a0626a854fe8309b32cb1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 27 Apr 2013 09:59:30 -0500 Subject: rbd: set mapping info earlier Set the mapping size and features earlier in rbd_dev_probe_finish(). Define rbd_dev_mapping_clear() as an inverse for setting those fields, and use it both in error handling in rbd_dev_image_probe() and in the final cleanup in rbd_dev_release(). Change the name of rbd_dev_set_mapping() to rbd_dev_mapping_set(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 87ef01189b83..98e0b8c3def8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -866,7 +866,7 @@ static struct rbd_snap *snap_by_name(struct rbd_device *rbd_dev, return NULL; } -static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) +static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) { if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { @@ -886,6 +886,13 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) return 0; } +static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) +{ + rbd_dev->mapping.size = 0; + rbd_dev->mapping.features = 0; + rbd_dev->mapping.read_only = true; +} + static void rbd_header_free(struct rbd_image_header *header) { kfree(header->object_prefix); @@ -4757,7 +4764,11 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) goto err_out_snaps; - ret = rbd_dev_set_mapping(rbd_dev); + ret = rbd_dev_header_watch_sync(rbd_dev, 1); + if (ret) + goto err_out_snaps; + + ret =
rbd_dev_mapping_set(rbd_dev); if (ret) goto err_out_snaps; @@ -4790,10 +4801,6 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) goto err_out_bus; - ret = rbd_dev_header_watch_sync(rbd_dev, 1); - if (ret) - goto err_out_bus; - /* Everything's ready. Announce the disk to the world. */ set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); @@ -4817,6 +4824,7 @@ err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_id: rbd_dev_id_put(rbd_dev); + rbd_dev_mapping_clear(rbd_dev); err_out_snaps: rbd_remove_all_snaps(rbd_dev); @@ -4974,6 +4982,7 @@ static void rbd_dev_release(struct device *dev) /* done with the id, and with the rbd_dev */ rbd_dev_id_put(rbd_dev); + rbd_dev_mapping_clear(rbd_dev); rbd_assert(rbd_dev->rbd_client != NULL); rbd_spec_put(rbd_dev->parent_spec); kfree(rbd_dev->header_name); -- cgit v1.2.3 From b480815a17bc6bfe85d4931c53e5a8fded7f889e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 15:44:36 -0500 Subject: rbd: kill __rbd_remove() The function __rbd_remove() is used in two spots, and it's fairly simple. It combines cleanup of part of the ceph-side state as well as cleaning up the Linux-side state. Just open code it in the two callers and eliminate the function. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 98e0b8c3def8..0bae4e74555d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4992,12 +4992,6 @@ static void rbd_dev_release(struct device *dev) module_put(THIS_MODULE); } -static void __rbd_remove(struct rbd_device *rbd_dev) -{ - rbd_remove_all_snaps(rbd_dev); - rbd_bus_del_dev(rbd_dev); -} - static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) { while (rbd_dev->parent_spec) { @@ -5013,7 +5007,8 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) first = second; second = third; } - __rbd_remove(second); + rbd_remove_all_snaps(second); + rbd_bus_del_dev(second); rbd_spec_put(first->parent_spec); first->parent_spec = NULL; first->parent_overlap = 0; @@ -5058,8 +5053,8 @@ static ssize_t rbd_remove(struct bus_type *bus, rbd_dev_remove_parent(rbd_dev); - __rbd_remove(rbd_dev); - + rbd_remove_all_snaps(rbd_dev); + rbd_bus_del_dev(rbd_dev); done: mutex_unlock(&ctl_mutex); -- cgit v1.2.3 From ad945fc1da42965a31089d29de3754047861f348 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 15:44:36 -0500 Subject: rbd: fix rbd_dev_remove_parent() In certain error paths, it is possible for an rbd device to have a parent spec but no parent rbd_dev. In rbd_dev_remove_parent() use the parent field rather than parent_spec in determining whether to try to remove any parent devices. Use assertions to indicate that any non-null parent pointer has parent_spec associated with it. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0bae4e74555d..bc1e6e8e2ad9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4994,7 +4994,7 @@ static void rbd_dev_release(struct device *dev) static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) { - while (rbd_dev->parent_spec) { + while (rbd_dev->parent) { struct rbd_device *first = rbd_dev; struct rbd_device *second = first->parent; struct rbd_device *third; @@ -5007,12 +5007,15 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) first = second; second = third; } + rbd_assert(second); rbd_remove_all_snaps(second); rbd_bus_del_dev(second); + first->parent = NULL; + first->parent_overlap = 0; + + rbd_assert(first->parent_spec); rbd_spec_put(first->parent_spec); first->parent_spec = NULL; - first->parent_overlap = 0; - first->parent = NULL; } } -- cgit v1.2.3 From 2e93bf9e465b7d0ccf703fb791c663435d9522cf Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 15:44:36 -0500 Subject: rbd: remove parent devices on probe error When an error occurs while finishing probing a device it is assumed that parent devices get cleaned up when deleting a device. They don't. Add a call to clean them up. Note that this means the parent spec will already be cleaned up so it doesn't have to be in one of the rbd_add() error paths. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bc1e6e8e2ad9..eed7029b8ee8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4813,8 +4813,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) return ret; err_out_bus: - /* this will also clean up rest of rbd_dev stuff */ - + rbd_dev_remove_parent(rbd_dev); rbd_bus_del_dev(rbd_dev); return ret; @@ -4931,7 +4930,6 @@ static ssize_t rbd_add(struct bus_type *bus, return count; err_out_rbd_dev: - rbd_spec_put(rbd_dev->parent_spec); kfree(rbd_dev->header_name); rbd_dev_destroy(rbd_dev); err_out_client: -- cgit v1.2.3 From 5de10f3b0c99983e3f9ec19baa1eb691685d9b8f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 26 Apr 2013 15:44:37 -0500 Subject: rbd: probe for the parent earlier Probe for a parent device earlier in rbd_dev_probe_finish(), before starting to set up the Linux side of the rbd device. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index eed7029b8ee8..e86238c90677 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4772,6 +4772,10 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) goto err_out_snaps; + ret = rbd_dev_probe_parent(rbd_dev); + if (ret) + goto err_out_mapping; + /* generate unique id: find highest unique id, add one */ rbd_dev_id_get(rbd_dev); @@ -4797,10 +4801,6 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) goto err_out_disk; - ret = rbd_dev_probe_parent(rbd_dev); - if (ret) - goto err_out_bus; - /* Everything's ready. Announce the disk to the world. 
*/ set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); @@ -4812,17 +4812,14 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) return ret; -err_out_bus: - rbd_dev_remove_parent(rbd_dev); - rbd_bus_del_dev(rbd_dev); - - return ret; err_out_disk: rbd_free_disk(rbd_dev); err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_id: rbd_dev_id_put(rbd_dev); + rbd_dev_remove_parent(rbd_dev); +err_out_mapping: rbd_dev_mapping_clear(rbd_dev); err_out_snaps: rbd_remove_all_snaps(rbd_dev); -- cgit v1.2.3 From 9bb81c9be90c1ad265547f0a40f543548d263fb4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 27 Apr 2013 09:59:30 -0500 Subject: rbd: move more initialization into rbd_dev_image_probe() Move a block of initialization related to the "ceph-side" of an rbd image out of rbd_dev_probe_finish() and into rbd_dev_image_probe(). Add appropriate error handling to clean things up in the event any of these new functions return an error. We know that rbd_dev_snaps_update(), rbd_dev_spec_update(), and rbd_dev_probe_parent() all clean up after themselves before they return an error, so no special cleanup is required except when an earlier call succeeds. Since rbd_dev_spec_update() only updates the spec field (whose cleanup will be handled by dropping the last reference to the spec) there is no cleanup action associated with that.
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e86238c90677..ebf4d470e13f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4755,26 +4755,13 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) { int ret; - /* no need to lock here, as rbd_dev is not registered yet */ - ret = rbd_dev_snaps_update(rbd_dev); - if (ret) - return ret; - - ret = rbd_dev_spec_update(rbd_dev); - if (ret) - goto err_out_snaps; - ret = rbd_dev_header_watch_sync(rbd_dev, 1); if (ret) - goto err_out_snaps; + return ret; ret = rbd_dev_mapping_set(rbd_dev); if (ret) - goto err_out_snaps; - - ret = rbd_dev_probe_parent(rbd_dev); - if (ret) - goto err_out_mapping; + return ret; /* generate unique id: find highest unique id, add one */ rbd_dev_id_get(rbd_dev); @@ -4818,11 +4805,7 @@ err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_id: rbd_dev_id_put(rbd_dev); - rbd_dev_remove_parent(rbd_dev); -err_out_mapping: rbd_dev_mapping_clear(rbd_dev); -err_out_snaps: - rbd_remove_all_snaps(rbd_dev); return ret; } @@ -4854,11 +4837,28 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) if (ret) goto out_err; + ret = rbd_dev_snaps_update(rbd_dev); + if (ret) + goto out_err; + + ret = rbd_dev_spec_update(rbd_dev); + if (ret) + goto err_out_snaps; + + ret = rbd_dev_probe_parent(rbd_dev); + if (ret) + goto err_out_snaps; + ret = rbd_dev_probe_finish(rbd_dev); if (ret) - rbd_header_free(&rbd_dev->header); + goto err_out_parent; return ret; +err_out_parent: + rbd_dev_remove_parent(rbd_dev); + rbd_header_free(&rbd_dev->header); +err_out_snaps: + rbd_remove_all_snaps(rbd_dev); out_err: kfree(rbd_dev->spec->image_id); rbd_dev->spec->image_id = NULL; -- cgit v1.2.3 From 332bb12db9459d52dfcdb278e7607351d2eff6ab Mon Sep 17 00:00:00 2001 From: Alex Elder Date: 
Sat, 27 Apr 2013 09:59:30 -0500 Subject: rbd: define rbd_header_name() Define a new function rbd_header_name(), which allocates and formats the name of the header object for the rbd device. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 66 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 28 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ebf4d470e13f..44739640d94f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4592,18 +4592,6 @@ out: static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) { int ret; - size_t size; - - /* Record the header object name for this rbd image. */ - - size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); - rbd_dev->header_name = kmalloc(size, GFP_KERNEL); - if (!rbd_dev->header_name) { - ret = -ENOMEM; - goto out_err; - } - sprintf(rbd_dev->header_name, "%s%s", - rbd_dev->spec->image_name, RBD_SUFFIX); /* Populate rbd image metadata */ @@ -4632,22 +4620,9 @@ out_err: static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) { - size_t size; int ret; u64 ver = 0; - /* - * Image id was filled in by the caller. Record the header - * object name for this rbd image. - */ - size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); - rbd_dev->header_name = kmalloc(size, GFP_KERNEL); - if (!rbd_dev->header_name) - return -ENOMEM; - sprintf(rbd_dev->header_name, "%s%s", - RBD_HEADER_PREFIX, rbd_dev->spec->image_id); - - /* Get the size and object order for the image */ ret = rbd_dev_v2_image_size(rbd_dev); if (ret) goto out_err; @@ -4810,6 +4785,33 @@ err_out_id: return ret; } +static int rbd_dev_header_name(struct rbd_device *rbd_dev) +{ + struct rbd_spec *spec = rbd_dev->spec; + size_t size; + + /* Record the header object name for this rbd image. 
*/ + + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + + if (rbd_dev->image_format == 1) + size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); + else + size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); + + rbd_dev->header_name = kmalloc(size, GFP_KERNEL); + if (!rbd_dev->header_name) + return -ENOMEM; + + if (rbd_dev->image_format == 1) + sprintf(rbd_dev->header_name, "%s%s", + spec->image_name, RBD_SUFFIX); + else + sprintf(rbd_dev->header_name, "%s%s", + RBD_HEADER_PREFIX, spec->image_id); + return 0; +} + /* * Probe for the existence of the header object for the given rbd * device. For format 2 images this includes determining the image @@ -4830,16 +4832,20 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) rbd_assert(rbd_dev->spec->image_id); rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + ret = rbd_dev_header_name(rbd_dev); + if (ret) + goto err_out_format; + if (rbd_dev->image_format == 1) ret = rbd_dev_v1_probe(rbd_dev); else ret = rbd_dev_v2_probe(rbd_dev); if (ret) - goto out_err; + goto out_header_name; ret = rbd_dev_snaps_update(rbd_dev); if (ret) - goto out_err; + goto out_header_name; ret = rbd_dev_spec_update(rbd_dev); if (ret) @@ -4859,7 +4865,11 @@ err_out_parent: rbd_header_free(&rbd_dev->header); err_out_snaps: rbd_remove_all_snaps(rbd_dev); -out_err: +out_header_name: + kfree(rbd_dev->header_name); + rbd_dev->header_name = NULL; +err_out_format: + rbd_dev->image_format = 0; kfree(rbd_dev->spec->image_id); rbd_dev->spec->image_id = NULL; -- cgit v1.2.3 From 0d8189e175380c029a309f05f44e82bacf1c0404 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 27 Apr 2013 09:59:30 -0500 Subject: rbd: don't clean up watch in device release function Currently, a watch on an rbd device header object gets torn down when its final Linux device reference gets dropped. Instead, tear it down when removing the device. If an error occurs cleaning up the watch event when unmapping, abort the unmap request. 
All images (including parents) still get watch requests set up, so tear these down also, in rbd_dev_remove_parent(). For now, ignore any errors that occur in this case. Get rid of local variable "rc" in rbd_remove(); use "ret" instead (they both somehow ended up defined in the function and only one is needed). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 44739640d94f..738263f354f6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4729,6 +4729,7 @@ out_err: static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) { int ret; + int tmp; ret = rbd_dev_header_watch_sync(rbd_dev, 1); if (ret) @@ -4780,6 +4781,9 @@ err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_id: rbd_dev_id_put(rbd_dev); + tmp = rbd_dev_header_watch_sync(rbd_dev, 0); + if (tmp) + rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); rbd_dev_mapping_clear(rbd_dev); return ret; @@ -4975,9 +4979,6 @@ static void rbd_dev_release(struct device *dev) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - if (rbd_dev->watch_event) - rbd_dev_header_watch_sync(rbd_dev, 0); - /* clean up and free blkdev */ rbd_free_disk(rbd_dev); unregister_blkdev(rbd_dev->major, rbd_dev->name); @@ -5003,6 +5004,7 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) struct rbd_device *first = rbd_dev; struct rbd_device *second = first->parent; struct rbd_device *third; + int ret; /* * Follow to the parent with no grandparent and @@ -5013,6 +5015,10 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) second = third; } rbd_assert(second); + ret = rbd_dev_header_watch_sync(rbd_dev, 0); + if (ret) + rbd_warn(rbd_dev, + "failed to cancel watch event (%d)\n", ret); rbd_remove_all_snaps(second); rbd_bus_del_dev(second); first->parent = NULL; @@ -5029,13 +5035,13 @@ 
static ssize_t rbd_remove(struct bus_type *bus, size_t count) { struct rbd_device *rbd_dev = NULL; - int target_id, rc; + int target_id; unsigned long ul; - int ret = count; + int ret; - rc = strict_strtoul(buf, 10, &ul); - if (rc) - return rc; + ret = strict_strtoul(buf, 10, &ul); + if (ret) + return ret; /* convert to int; abort if we lost anything in the conversion */ target_id = (int) ul; @@ -5059,6 +5065,15 @@ static ssize_t rbd_remove(struct bus_type *bus, if (ret < 0) goto done; + ret = rbd_dev_header_watch_sync(rbd_dev, 0); + if (ret) { + rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); + clear_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); + smp_mb(); + return ret; + } + ret = count; + rbd_dev_remove_parent(rbd_dev); rbd_remove_all_snaps(rbd_dev); -- cgit v1.2.3 From 96f03e08f9f27cf72d2c24b4e75ade81d2df3c75 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 27 Apr 2013 09:59:31 -0500 Subject: rbd: don't bother checking whether order changes When a format 2 image is refreshed, code is in place to verify that the object order never changes from what it was originally. This relies on the fact that the refresh will occur *after* an initial load of information about the image. An upcoming patch makes it possible for the refresh to occur first, so we can no longer make this order check. The order really can't ever change anyway--this was just a sanity check. So get rid of it. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 738263f354f6..52c722b471e4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4024,20 +4024,12 @@ static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) { int ret; - __u8 obj_order; down_write(&rbd_dev->header_rwsem); - /* Grab old order first, to see if it changes */ - - obj_order = rbd_dev->header.obj_order, ret = rbd_dev_v2_image_size(rbd_dev); if (ret) goto out; - if (rbd_dev->header.obj_order != obj_order) { - ret = -EIO; - goto out; - } rbd_update_mapping_size(rbd_dev); ret = rbd_dev_v2_snap_context(rbd_dev, hver); -- cgit v1.2.3 From b644de2ba0c5b590db9195c03358ccd0f061daa6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 27 Apr 2013 09:59:31 -0500 Subject: rbd: set up watch in rbd_dev_image_probe() Move setting up the watch request for an image so it's done in rbd_dev_image_probe() rather than rbd_dev_probe_finish(). Move it all the way up to before doing the initial probe. This avoids a potential race condition, in which we get (and use) the initial snapshot context for an image, and it gets changed between that time and the time we get the watch set up. 
This resolves: http://tracker.ceph.com/issues/3871 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 52c722b471e4..ac94aa4b4d22 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4721,11 +4721,6 @@ out_err: static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) { int ret; - int tmp; - - ret = rbd_dev_header_watch_sync(rbd_dev, 1); - if (ret) - return ret; ret = rbd_dev_mapping_set(rbd_dev); if (ret) @@ -4773,9 +4768,6 @@ err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_id: rbd_dev_id_put(rbd_dev); - tmp = rbd_dev_header_watch_sync(rbd_dev, 0); - if (tmp) - rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); rbd_dev_mapping_clear(rbd_dev); return ret; @@ -4816,6 +4808,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) static int rbd_dev_image_probe(struct rbd_device *rbd_dev) { int ret; + int tmp; /* * Get the id from the image id object. 
If it's not a @@ -4832,16 +4825,20 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) if (ret) goto err_out_format; + ret = rbd_dev_header_watch_sync(rbd_dev, 1); + if (ret) + goto out_header_name; + if (rbd_dev->image_format == 1) ret = rbd_dev_v1_probe(rbd_dev); else ret = rbd_dev_v2_probe(rbd_dev); if (ret) - goto out_header_name; + goto err_out_watch; ret = rbd_dev_snaps_update(rbd_dev); if (ret) - goto out_header_name; + goto err_out_watch; ret = rbd_dev_spec_update(rbd_dev); if (ret) @@ -4861,6 +4858,10 @@ err_out_parent: rbd_header_free(&rbd_dev->header); err_out_snaps: rbd_remove_all_snaps(rbd_dev); +err_out_watch: + tmp = rbd_dev_header_watch_sync(rbd_dev, 0); + if (tmp) + rbd_warn(rbd_dev, "unable to tear down watch request\n"); out_header_name: kfree(rbd_dev->header_name); rbd_dev->header_name = NULL; -- cgit v1.2.3 From 79ab7558aac7622109e9d9b089cac2c5f06aca20 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 28 Apr 2013 23:32:34 -0500 Subject: rbd: drop module later Drop the module reference at the end of rbd_remove() for symmetry with adding a reference at the top of rbd_add(). 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ac94aa4b4d22..59048191ab17 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4986,9 +4986,6 @@ static void rbd_dev_release(struct device *dev) rbd_spec_put(rbd_dev->parent_spec); kfree(rbd_dev->header_name); rbd_dev_destroy(rbd_dev); - - /* release module ref */ - module_put(THIS_MODULE); } static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) @@ -5071,6 +5068,7 @@ static ssize_t rbd_remove(struct bus_type *bus, rbd_remove_all_snaps(rbd_dev); rbd_bus_del_dev(rbd_dev); + module_put(THIS_MODULE); done: mutex_unlock(&ctl_mutex); -- cgit v1.2.3 From 200a6a8be5dba96df121f3d2363964dd77ee7e1b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 28 Apr 2013 23:32:34 -0500 Subject: rbd: don't destroy rbd_dev in device release function Rename rbd_dev_probe_finish() to be rbd_dev_device_setup(). Its purpose is to set up the Linux side of an rbd device mapping. Rename rbd_dev_release() to be rbd_dev_device_release(), making it more obvious it serves as the inverse of the setup function (or it will). Encapsulate some of what was done in rbd_dev_release() into a new function rbd_dev_image_release(), which serves as the inverse of setting up the ceph side of the mapped rbd image. Define a new helper rbd_dev_clear_mapping() to simply zero out the fields of a mapping structure--the inverse of rbd_dev_set_mapping(). 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 59048191ab17..feaa2e9192a1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -358,7 +358,7 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request); static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); -static void rbd_dev_release(struct device *dev); +static void rbd_dev_device_release(struct device *dev); static void rbd_snap_destroy(struct rbd_snap *snap); static ssize_t rbd_add(struct bus_type *bus, const char *buf, @@ -893,6 +893,13 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) rbd_dev->mapping.read_only = true; } +static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev) +{ + rbd_dev->mapping.size = 0; + rbd_dev->mapping.features = 0; + rbd_dev->mapping.read_only = true; +} + static void rbd_header_free(struct rbd_image_header *header) { kfree(header->object_prefix); @@ -4182,7 +4189,7 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) dev->bus = &rbd_bus_type; dev->type = &rbd_device_type; dev->parent = &rbd_root_dev; - dev->release = rbd_dev_release; + dev->release = rbd_dev_device_release; dev_set_name(dev, "%d", rbd_dev->dev_id); ret = device_register(dev); @@ -4718,7 +4725,7 @@ out_err: return ret; } -static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) +static int rbd_dev_device_setup(struct rbd_device *rbd_dev) { int ret; @@ -4800,6 +4807,15 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) return 0; } +static void rbd_dev_image_release(struct rbd_device *rbd_dev) +{ + rbd_header_free(&rbd_dev->header); + rbd_assert(rbd_dev->rbd_client != NULL); + rbd_spec_put(rbd_dev->parent_spec); + kfree(rbd_dev->header_name); + rbd_dev_destroy(rbd_dev); +} + /* * Probe for the existence of the header object for the given rbd * 
device. For format 2 images this includes determining the image @@ -4848,7 +4864,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) if (ret) goto err_out_snaps; - ret = rbd_dev_probe_finish(rbd_dev); + ret = rbd_dev_device_setup(rbd_dev); if (ret) goto err_out_parent; @@ -4968,24 +4984,19 @@ static struct rbd_device *__rbd_get_dev(unsigned long dev_id) return NULL; } -static void rbd_dev_release(struct device *dev) +static void rbd_dev_device_release(struct device *dev) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - /* clean up and free blkdev */ rbd_free_disk(rbd_dev); + clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); + rbd_dev_clear_mapping(rbd_dev); unregister_blkdev(rbd_dev->major, rbd_dev->name); - - /* release allocated disk header fields */ - rbd_header_free(&rbd_dev->header); - - /* done with the id, and with the rbd_dev */ + rbd_dev->major = 0; rbd_dev_id_put(rbd_dev); rbd_dev_mapping_clear(rbd_dev); - rbd_assert(rbd_dev->rbd_client != NULL); - rbd_spec_put(rbd_dev->parent_spec); - kfree(rbd_dev->header_name); - rbd_dev_destroy(rbd_dev); + + rbd_dev_image_release(rbd_dev); } static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) -- cgit v1.2.3 From 6fd48b3be9f6d195a970b92040d097b5b886a99b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 28 Apr 2013 23:32:34 -0500 Subject: rbd: define rbd_dev_unprobe() Define a new function rbd_dev_unprobe() which undoes state changes that occur from calling rbd_dev_v1_probe() or rbd_dev_v2_probe(). Note that this is a superset of rbd_header_free(), which is now getting removed (it seems to have been used improperly anyway). Flesh out rbd_dev_image_release() so it undoes exactly what rbd_dev_image_probe() does. 
This means that: - rbd_dev_device_release() gets called when the last device reference gets dropped; - that undoes everything done by the rbd_dev_device_setup() call at the end of rbd_dev_image_probe() (and nothing more), ending by calling rbd_dev_image_release(); and - rbd_dev_image_release() undoes everything else done by rbd_dev_image_probe() (and this includes a call to rbd_dev_unprobe()). This means the image and device portions of an rbd device are fairly cleanly separated now, so error paths should be a little easier to verify than they used to be. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 78 ++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 40 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index feaa2e9192a1..408e29f102c8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -900,18 +900,6 @@ static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev) rbd_dev->mapping.read_only = true; } -static void rbd_header_free(struct rbd_image_header *header) -{ - kfree(header->object_prefix); - header->object_prefix = NULL; - kfree(header->snap_sizes); - header->snap_sizes = NULL; - kfree(header->snap_names); - header->snap_names = NULL; - rbd_snap_context_put(header->snapc); - header->snapc = NULL; -} - static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) { char *name; @@ -4588,6 +4576,27 @@ out: return ret; } +/* Undo whatever state changes are made by v1 or v2 image probe */ + +static void rbd_dev_unprobe(struct rbd_device *rbd_dev) +{ + struct rbd_image_header *header; + + rbd_dev_remove_parent(rbd_dev); + rbd_spec_put(rbd_dev->parent_spec); + rbd_dev->parent_spec = NULL; + rbd_dev->parent_overlap = 0; + + /* Free dynamic fields from the header, then zero it out */ + + header = &rbd_dev->header; + rbd_snap_context_put(header->snapc); + kfree(header->snap_sizes); + kfree(header->snap_names); +
kfree(header->object_prefix); + memset(header, 0, sizeof (*header)); +} + static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) { int ret; @@ -4809,10 +4818,19 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) static void rbd_dev_image_release(struct rbd_device *rbd_dev) { - rbd_header_free(&rbd_dev->header); - rbd_assert(rbd_dev->rbd_client != NULL); - rbd_spec_put(rbd_dev->parent_spec); + int ret; + + rbd_remove_all_snaps(rbd_dev); + rbd_dev_unprobe(rbd_dev); + ret = rbd_dev_header_watch_sync(rbd_dev, 0); + if (ret) + rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); kfree(rbd_dev->header_name); + rbd_dev->header_name = NULL; + rbd_dev->image_format = 0; + kfree(rbd_dev->spec->image_id); + rbd_dev->spec->image_id = NULL; + rbd_dev_destroy(rbd_dev); } @@ -4854,7 +4872,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) ret = rbd_dev_snaps_update(rbd_dev); if (ret) - goto err_out_watch; + goto err_out_probe; ret = rbd_dev_spec_update(rbd_dev); if (ret) @@ -4865,15 +4883,13 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) goto err_out_snaps; ret = rbd_dev_device_setup(rbd_dev); - if (ret) - goto err_out_parent; + if (!ret) + return 0; - return ret; -err_out_parent: - rbd_dev_remove_parent(rbd_dev); - rbd_header_free(&rbd_dev->header); err_out_snaps: rbd_remove_all_snaps(rbd_dev); +err_out_probe: + rbd_dev_unprobe(rbd_dev); err_out_watch: tmp = rbd_dev_header_watch_sync(rbd_dev, 0); if (tmp) @@ -5005,7 +5021,6 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) struct rbd_device *first = rbd_dev; struct rbd_device *second = first->parent; struct rbd_device *third; - int ret; /* * Follow to the parent with no grandparent and @@ -5016,11 +5031,6 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) second = third; } rbd_assert(second); - ret = rbd_dev_header_watch_sync(rbd_dev, 0); - if (ret) - rbd_warn(rbd_dev, - "failed to cancel watch event (%d)\n", ret); - rbd_remove_all_snaps(second); 
rbd_bus_del_dev(second); first->parent = NULL; first->parent_overlap = 0; @@ -5065,19 +5075,7 @@ static ssize_t rbd_remove(struct bus_type *bus, spin_unlock_irq(&rbd_dev->lock); if (ret < 0) goto done; - - ret = rbd_dev_header_watch_sync(rbd_dev, 0); - if (ret) { - rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); - clear_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); - smp_mb(); - return ret; - } ret = count; - - rbd_dev_remove_parent(rbd_dev); - - rbd_remove_all_snaps(rbd_dev); rbd_bus_del_dev(rbd_dev); module_put(THIS_MODULE); done: -- cgit v1.2.3 From 8ad42cd0c002fa278f6d0135e22fcb188e400a28 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 28 Apr 2013 23:32:34 -0500 Subject: rbd: don't have device release destroy rbd_dev Currently an rbd_device structure gets destroyed from the release routine for the device embedded within it. Stop doing that, instead calling rbd_dev_image_release() right after rbd_bus_del_dev() wherever the latter is called. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 408e29f102c8..57e56617e45f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5011,8 +5011,6 @@ static void rbd_dev_device_release(struct device *dev) rbd_dev->major = 0; rbd_dev_id_put(rbd_dev); rbd_dev_mapping_clear(rbd_dev); - - rbd_dev_image_release(rbd_dev); } static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) @@ -5032,6 +5030,7 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) } rbd_assert(second); rbd_bus_del_dev(second); + rbd_dev_image_release(second); first->parent = NULL; first->parent_overlap = 0; @@ -5077,6 +5076,7 @@ static ssize_t rbd_remove(struct bus_type *bus, goto done; ret = count; rbd_bus_del_dev(rbd_dev); + rbd_dev_image_release(rbd_dev); module_put(THIS_MODULE); done: mutex_unlock(&ctl_mutex); -- cgit v1.2.3 From 
b536f69a3a589113992c32982bf2981c8225c9da Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 28 Apr 2013 23:32:34 -0500 Subject: rbd: set up devices only for mapped images Stop setting up Linux devices during the image probe operation. Instead, set up the devices as a separate step after the image probe, in rbd_add(). A consequence of this is that only mapped images get devices assigned to them, which is pretty sweet. This resolves: http://tracker.ceph.com/issues/4774 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 57e56617e45f..d41f97690343 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4879,10 +4879,6 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) goto err_out_snaps; ret = rbd_dev_probe_parent(rbd_dev); - if (ret) - goto err_out_snaps; - - ret = rbd_dev_device_setup(rbd_dev); if (!ret) return 0; @@ -4964,9 +4960,12 @@ static ssize_t rbd_add(struct bus_type *bus, if (rc < 0) goto err_out_rbd_dev; - return count; + rc = rbd_dev_device_setup(rbd_dev); + if (!rc) + return count; + + rbd_dev_image_release(rbd_dev); err_out_rbd_dev: - kfree(rbd_dev->header_name); rbd_dev_destroy(rbd_dev); err_out_client: rbd_put_client(rbdc); @@ -5029,7 +5028,6 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) second = third; } rbd_assert(second); - rbd_bus_del_dev(second); rbd_dev_image_release(second); first->parent = NULL; first->parent_overlap = 0; -- cgit v1.2.3 From 812164f8c3f6f5348aa69003a2f81775c2872ac0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:32 -0500 Subject: ceph: use ceph_create_snap_context() Now that we have a library routine to create snap contexts, use it. 
This is part of: http://tracker.ceph.com/issues/4857 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 41 ++++++----------------------------------- fs/ceph/snap.c | 3 +-- 2 files changed, 7 insertions(+), 37 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d41f97690343..5c1c38dc0b51 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -672,35 +672,6 @@ static void rbd_client_release(struct kref *kref) kfree(rbdc); } -/* Caller has to fill in snapc->seq and snapc->snaps[0..snap_count-1] */ - -static struct ceph_snap_context *rbd_snap_context_create(u32 snap_count) -{ - struct ceph_snap_context *snapc; - size_t size; - - size = sizeof (struct ceph_snap_context); - size += snap_count * sizeof (snapc->snaps[0]); - snapc = kzalloc(size, GFP_KERNEL); - if (!snapc) - return NULL; - - atomic_set(&snapc->nref, 1); - snapc->num_snaps = snap_count; - - return snapc; -} - -static inline void rbd_snap_context_get(struct ceph_snap_context *snapc) -{ - (void)ceph_get_snap_context(snapc); -} - -static inline void rbd_snap_context_put(struct ceph_snap_context *snapc) -{ - ceph_put_snap_context(snapc); -} - /* * Drop reference to ceph client node. If it's not referenced anymore, release * it. 
@@ -820,7 +791,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->image_size = le64_to_cpu(ondisk->image_size); - header->snapc = rbd_snap_context_create(snap_count); + header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); if (!header->snapc) goto out_err; header->snapc->seq = le64_to_cpu(ondisk->snap_seq); @@ -1753,7 +1724,7 @@ static struct rbd_img_request *rbd_img_request_create( if (write_request) { down_read(&rbd_dev->header_rwsem); - rbd_snap_context_get(rbd_dev->header.snapc); + ceph_get_snap_context(rbd_dev->header.snapc); up_read(&rbd_dev->header_rwsem); } @@ -1805,7 +1776,7 @@ static void rbd_img_request_destroy(struct kref *kref) rbd_assert(img_request->obj_request_count == 0); if (img_request_write_test(img_request)) - rbd_snap_context_put(img_request->snapc); + ceph_put_snap_context(img_request->snapc); if (img_request_child_test(img_request)) rbd_obj_request_put(img_request->obj_request); @@ -3071,7 +3042,7 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) kfree(rbd_dev->header.snap_sizes); kfree(rbd_dev->header.snap_names); /* osd requests may still refer to snapc */ - rbd_snap_context_put(rbd_dev->header.snapc); + ceph_put_snap_context(rbd_dev->header.snapc); if (hver) *hver = h.obj_version; @@ -3914,7 +3885,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) goto out; ret = 0; - snapc = rbd_snap_context_create(snap_count); + snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); if (!snapc) { ret = -ENOMEM; goto out; @@ -4590,7 +4561,7 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) /* Free dynamic fields from the header, then zero it out */ header = &rbd_dev->header; - rbd_snap_context_put(header->snapc); + ceph_put_snap_context(header->snapc); kfree(header->snap_sizes); kfree(header->snap_names); kfree(header->object_prefix); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index cbb2f54a3019..f01645a27752 100644 --- a/fs/ceph/snap.c +++ 
b/fs/ceph/snap.c @@ -332,10 +332,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) err = -ENOMEM; if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) goto fail; - snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); + snapc = ceph_create_snap_context(num, GFP_NOFS); if (!snapc) goto fail; - atomic_set(&snapc->nref, 1); /* build (reverse sorted) snap vector */ num = 0; -- cgit v1.2.3 From 96882f55c40dcb4cd80b81a4374fdd297109ec98 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:32 -0500 Subject: rbd: fix up the layering warning message A warning gets spewed for any image being probed, including parent images. Set up a condition such that the warning message only gets printed for the image being mapped, not any of its parents. Also, I didn't like the way the warning ended up being so long. Make it a terse warning instead. People experimenting with layering will know what the message means. This is part of: http://tracker.ceph.com/issues/4867 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5c1c38dc0b51..71e2de2cff22 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4624,8 +4624,15 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) ret = rbd_dev_v2_parent_info(rbd_dev); if (ret) goto out_err; - rbd_warn(rbd_dev, "WARNING: kernel support for " - "layered rbd images is EXPERIMENTAL!"); + + /* + * Don't print a warning for parent images. We can + * tell this point because we won't know its pool + * name yet (just its pool id). 
+ */ + if (rbd_dev->spec->pool_name) + rbd_warn(rbd_dev, "WARNING: kernel layering " + "is EXPERIMENTAL!"); } /* If the image supports fancy striping, get its parameters */ -- cgit v1.2.3 From a3fbe5d447bf1f63efa7f4d8c222002ef136cf4b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:32 -0500 Subject: rbd: don't revalidate so much Whenever a header object event causes a mapped rbd image to refresh its header information, revalidate_disk() is being called. This was done in rbd_dev_refresh() outside the control mutex in order to avoid a lock inversion. Although an event like this *might* indicate the image has changed size, most of the time it does not. Record the image size before and after the refresh, and only call revalidate_disk() if it changes. This resolves: http://tracker.ceph.com/issues/4867 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 71e2de2cff22..ab2c788a22ad 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3065,19 +3065,22 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) { + u64 image_size; int ret; rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + image_size = rbd_dev->header.image_size; mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); if (rbd_dev->image_format == 1) ret = rbd_dev_v1_refresh(rbd_dev, hver); else ret = rbd_dev_v2_refresh(rbd_dev, hver); mutex_unlock(&ctl_mutex); - revalidate_disk(rbd_dev->disk); if (ret) rbd_warn(rbd_dev, "got notification but failed to " " update snaps: %d\n", ret); + if (image_size != rbd_dev->header.image_size) + revalidate_disk(rbd_dev->disk); return ret; } -- cgit v1.2.3 From cb75223d2b19161e8d916049673cd297cce43cdd Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:33 -0500 Subject: rbd:
snap names are pointer to constant data Make explicit that snapshot names don't change by making functions return and take parameters that point to const qualified data. This resolves: http://tracker.ceph.com/issues/4867 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ab2c788a22ad..4be3b2a1be8e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3435,10 +3435,10 @@ static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev, * Returns a dynamically-allocated snapshot name if successful, or a * pointer-coded error otherwise. */ -static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, +static const char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, u64 *snap_size, u64 *snap_features) { - char *snap_name; + const char *snap_name; int i; rbd_assert(which < rbd_dev->header.snapc->num_snaps); @@ -3907,7 +3907,7 @@ out: return ret; } -static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) +static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) { size_t size; void *reply_buf; @@ -3948,13 +3948,13 @@ out: return snap_name; } -static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, +static const char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, u64 *snap_size, u64 *snap_features) { u64 snap_id; u64 size; u64 features; - char *snap_name; + const char *snap_name; int ret; rbd_assert(which < rbd_dev->header.snapc->num_snaps); @@ -3978,7 +3978,7 @@ out_err: return ERR_PTR(ret); } -static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, +static const char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, u64 *snap_size, u64 *snap_features) { if (rbd_dev->image_format == 1) @@ -4045,7 +4045,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
while (index < snap_count || links != head) { u64 snap_id; struct rbd_snap *snap; - char *snap_name; + const char *snap_name; u64 snap_size = 0; u64 snap_features = 0; -- cgit v1.2.3 From b21ebdddeb2aa86677dc7d0e3cf6918cac08f92c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:32 -0500 Subject: rbd: stop tracking header object version The rbd code takes care to maintain the version of the header object. This was done in hopes of using it to detect a change in the object between reading it and setting up a watch request to be notified of changes. The mechanism was never fully implemented, however. And we now avoid the original problem by setting up the watch request before ever reading the content of the header. The osd doesn't interpret the object version supplied with a WATCH osd op, nor does it use the version supplied with a NOTIFY_ACK op (we can just supply 0 for both). There is therefore no need to maintain the header's object version any more, so stop doing so. We'll be able to simplify some more rbd code in the next few patches as a result of this. 
This resolves: http://tracker.ceph.com/issues/3952 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4be3b2a1be8e..8875bebbacfc 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -110,8 +110,6 @@ struct rbd_image_header { u64 stripe_unit; u64 stripe_count; - - u64 obj_version; }; /* @@ -2554,8 +2552,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) rbd_dev->watch_request->osd_req); osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, - rbd_dev->watch_event->cookie, - rbd_dev->header.obj_version, start); + rbd_dev->watch_event->cookie, 0, start); rbd_osd_req_format_write(obj_request); ret = rbd_obj_request_submit(osdc, obj_request); @@ -2987,8 +2984,6 @@ static int rbd_read_header(struct rbd_device *rbd_dev, if (IS_ERR(ondisk)) return PTR_ERR(ondisk); ret = rbd_header_from_disk(header, ondisk); - if (ret >= 0) - header->obj_version = ver; kfree(ondisk); return ret; @@ -3044,9 +3039,6 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) /* osd requests may still refer to snapc */ ceph_put_snap_context(rbd_dev->header.snapc); - if (hver) - *hver = h.obj_version; - rbd_dev->header.obj_version = h.obj_version; rbd_dev->header.image_size = h.image_size; rbd_dev->header.snapc = h.snapc; rbd_dev->header.snap_names = h.snap_names; @@ -4656,7 +4648,6 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) ret = rbd_dev_v2_snap_context(rbd_dev, &ver); if (ret) goto out_err; - rbd_dev->header.obj_version = ver; dout("discovered version 2 image, header name is %s\n", rbd_dev->header_name); -- cgit v1.2.3 From 7097f8df6e679207c949673d2959505b59a1a30e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:33 -0500 Subject: rbd: get rid of some version parameters Several functions in rbd have parameters meant to allow 
the version of an object to be passed in or out. The purpose of those was to allow the version of a header object to be maintained, but we no longer do that. As a result, these parameters are never actually needed or used, so get rid of them. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8875bebbacfc..77265710dd1a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2838,8 +2838,7 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) static int rbd_obj_read_sync(struct rbd_device *rbd_dev, const char *object_name, - u64 offset, u64 length, - void *buf, u64 *version) + u64 offset, u64 length, void *buf) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; @@ -2890,10 +2889,8 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); size = (size_t) obj_request->xferred; ceph_copy_from_page_vector(pages, buf, 0, size); - rbd_assert(size <= (size_t) INT_MAX); - ret = (int) size; - if (version) - *version = obj_request->version; + rbd_assert(size <= (size_t)INT_MAX); + ret = (int)size; out: if (obj_request) rbd_obj_request_put(obj_request); @@ -2914,7 +2911,7 @@ out: * Returns a pointer-coded errno if a failure occurs. 
*/ static struct rbd_image_header_ondisk * -rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) +rbd_dev_v1_header_read(struct rbd_device *rbd_dev) { struct rbd_image_header_ondisk *ondisk = NULL; u32 snap_count = 0; @@ -2942,7 +2939,7 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) return ERR_PTR(-ENOMEM); ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, - 0, size, ondisk, version); + 0, size, ondisk); if (ret < 0) goto out_err; if ((size_t)ret < size) { @@ -2977,10 +2974,9 @@ static int rbd_read_header(struct rbd_device *rbd_dev, struct rbd_image_header *header) { struct rbd_image_header_ondisk *ondisk; - u64 ver = 0; int ret; - ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); + ondisk = rbd_dev_v1_header_read(rbd_dev); if (IS_ERR(ondisk)) return PTR_ERR(ondisk); ret = rbd_header_from_disk(header, ondisk); -- cgit v1.2.3 From cc4a38bdd587a1843540989f262feb7bdc43c468 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:33 -0500 Subject: rbd: more version parameter removal Continued from the last patch, more parameters that can go away because we no longer have a need to track object versions. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 77265710dd1a..613750933588 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -429,8 +429,8 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); static void rbd_img_parent_read(struct rbd_obj_request *obj_request); static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); -static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver); -static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); +static int rbd_dev_refresh(struct rbd_device *rbd_dev); +static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev); static int rbd_open(struct block_device *bdev, fmode_t mode) { @@ -2468,8 +2468,7 @@ out_err: obj_request_done_set(obj_request); } -static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, - u64 ver, u64 notify_id) +static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id) { struct rbd_obj_request *obj_request; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; @@ -2487,7 +2486,7 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, obj_request->callback = rbd_obj_request_put; osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, - notify_id, ver, 0); + notify_id, 0, 0); rbd_osd_req_format_read(obj_request); ret = rbd_obj_request_submit(osdc, obj_request); @@ -2501,17 +2500,16 @@ out: static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { struct rbd_device *rbd_dev = (struct rbd_device *)data; - u64 hver; if (!rbd_dev) return; dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, - rbd_dev->header_name, (unsigned long long) notify_id, - (unsigned int) opcode); - (void)rbd_dev_refresh(rbd_dev, &hver); + rbd_dev->header_name, (unsigned long long)notify_id, + (unsigned int)opcode); + 
(void)rbd_dev_refresh(rbd_dev); - rbd_obj_notify_ack(rbd_dev, hver, notify_id); + rbd_obj_notify_ack(rbd_dev, notify_id); } /* @@ -3014,7 +3012,7 @@ static void rbd_update_mapping_size(struct rbd_device *rbd_dev) /* * only read the first part of the ondisk header, without the snaps info */ -static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) +static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev) { int ret; struct rbd_image_header h; @@ -3051,7 +3049,7 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver) return ret; } -static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) +static int rbd_dev_refresh(struct rbd_device *rbd_dev) { u64 image_size; int ret; @@ -3060,9 +3058,9 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver) image_size = rbd_dev->header.image_size; mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); if (rbd_dev->image_format == 1) - ret = rbd_dev_v1_refresh(rbd_dev, hver); + ret = rbd_dev_v1_refresh(rbd_dev); else - ret = rbd_dev_v2_refresh(rbd_dev, hver); + ret = rbd_dev_v2_refresh(rbd_dev); mutex_unlock(&ctl_mutex); if (ret) rbd_warn(rbd_dev, "got notification but failed to " @@ -3271,7 +3269,7 @@ static ssize_t rbd_image_refresh(struct device *dev, struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); int ret; - ret = rbd_dev_refresh(rbd_dev, NULL); + ret = rbd_dev_refresh(rbd_dev); return ret < 0 ? 
ret : size; } @@ -3824,7 +3822,7 @@ out_err: return ret; } -static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) +static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) { size_t size; int ret; @@ -3850,7 +3848,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_snapcontext", NULL, 0, - reply_buf, size, ver); + reply_buf, size, NULL); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -3978,7 +3976,7 @@ static const char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, return ERR_PTR(-EINVAL); } -static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) +static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev) { int ret; @@ -3989,7 +3987,7 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver) goto out; rbd_update_mapping_size(rbd_dev); - ret = rbd_dev_v2_snap_context(rbd_dev, hver); + ret = rbd_dev_v2_snap_context(rbd_dev); dout("rbd_dev_v2_snap_context returned %d\n", ret); if (ret) goto out; @@ -4591,7 +4589,6 @@ out_err: static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) { int ret; - u64 ver = 0; ret = rbd_dev_v2_image_size(rbd_dev); if (ret) @@ -4641,7 +4638,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) /* Get the snapshot context, plus the header version */ - ret = rbd_dev_v2_snap_context(rbd_dev, &ver); + ret = rbd_dev_v2_snap_context(rbd_dev); if (ret) goto out_err; -- cgit v1.2.3 From e2a58ee55b0f132c2a6cbf2504a1c651b261fb67 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:33 -0500 Subject: rbd: drop rbd_obj_method_sync() version parameter Only NULL is passed as the version argument to rbd_obj_method_sync(), so get rid of it. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 613750933588..1e13dffc13d5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1,3 +1,4 @@ + /* rbd.c -- Export ceph rados objects as a Linux block device @@ -2602,8 +2603,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, const void *outbound, size_t outbound_size, void *inbound, - size_t inbound_size, - u64 *version) + size_t inbound_size) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; @@ -2669,8 +2669,6 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, rbd_assert(obj_request->xferred < (u64)INT_MAX); ret = (int)obj_request->xferred; ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); - if (version) - *version = obj_request->version; out: if (obj_request) rbd_obj_request_put(obj_request); @@ -3463,7 +3461,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_size", &snapid, sizeof (snapid), - &size_buf, sizeof (size_buf), NULL); + &size_buf, sizeof (size_buf)); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -3500,7 +3498,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_object_prefix", NULL, 0, - reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); + reply_buf, RBD_OBJ_PREFIX_LEN_MAX); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -3536,7 +3534,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_features", &snapid, sizeof (snapid), - &features_buf, sizeof 
(features_buf), NULL); + &features_buf, sizeof (features_buf)); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -3593,7 +3591,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_parent", &snapid, sizeof (snapid), - reply_buf, size, NULL); + reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out_err; @@ -3650,7 +3648,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_stripe_unit_count", NULL, 0, - (char *)&striping_info_buf, size, NULL); + (char *)&striping_info_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -3717,7 +3715,7 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, "rbd", "dir_get_name", image_id, image_id_size, - reply_buf, size, NULL); + reply_buf, size); if (ret < 0) goto out; p = reply_buf; @@ -3848,7 +3846,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_snapcontext", NULL, 0, - reply_buf, size, NULL); + reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -3913,7 +3911,7 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_snapshot_name", &snap_id, sizeof (snap_id), - reply_buf, size, NULL); + reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) { snap_name = ERR_PTR(ret); @@ -4506,7 +4504,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ret = rbd_obj_method_sync(rbd_dev, object_name, "rbd", "get_id", NULL, 0, - response, RBD_IMAGE_ID_LEN_MAX, NULL); + response, RBD_IMAGE_ID_LEN_MAX); dout("%s: 
rbd_obj_method_sync returned %d\n", __func__, ret); if (ret == -ENOENT) { image_id = kstrdup("", GFP_KERNEL); -- cgit v1.2.3 From dedc81ea8468fd29bdd13eb5a362cab96b53d802 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:33 -0500 Subject: rbd: drop obj_request->version Nothing ever uses the version field maintained in the object request structure any more, so get rid of it. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1e13dffc13d5..3cc080c5c49e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -225,7 +225,6 @@ struct rbd_obj_request { struct ceph_osd_request *osd_req; u64 xferred; /* bytes transferred */ - u64 version; int result; rbd_obj_callback_t callback; @@ -1486,7 +1485,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, if (osd_req->r_result < 0) obj_request->result = osd_req->r_result; - obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); BUG_ON(osd_req->r_num_ops > 2); -- cgit v1.2.3 From 9682fc6d3a8b63f58fbfc5084f32c038170cfd6b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:33 -0500 Subject: rbd: look up snapshot name in names buffer Rather than scanning the list of snapshot structures for it, scan the snapshot context buffer containing snapshot names in order to determine for a format 1 image the name associated with a given snapshot id. Pull out the part of rbd_dev_v1_snap_info() that does this scan into a new function, _rbd_dev_v1_snap_name(). Have that function return a dynamically-allocated copy of the name, and don't duplicate it in rbd_dev_v1_snap_info(). 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3cc080c5c49e..5d1ed184bed2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -66,6 +66,8 @@ #define RBD_SNAP_HEAD_NAME "-" +#define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ + /* This allows a single page to hold an image name sent by OSD */ #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) #define RBD_IMAGE_ID_LEN_MAX 64 @@ -809,6 +811,33 @@ out_err: return -ENOMEM; } +static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) +{ + const char *snap_name; + + rbd_assert(which < rbd_dev->header.snapc->num_snaps); + + /* Skip over names until we find the one we are looking for */ + + snap_name = rbd_dev->header.snap_names; + while (which--) + snap_name += strlen(snap_name) + 1; + + return kstrdup(snap_name, GFP_KERNEL); +} + +static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) +{ + struct ceph_snap_context *snapc = rbd_dev->header.snapc; + u32 which; + + for (which = 0; which < snapc->num_snaps; which++) + if (snapc->snaps[which] == snap_id) + return which; + + return BAD_SNAP_INDEX; +} + static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) { struct rbd_snap *snap; @@ -3421,17 +3450,8 @@ static const char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, u64 *snap_size, u64 *snap_features) { const char *snap_name; - int i; - - rbd_assert(which < rbd_dev->header.snapc->num_snaps); - - /* Skip over names until we find the one we are looking for */ - snap_name = rbd_dev->header.snap_names; - for (i = 0; i < which; i++) - snap_name += strlen(snap_name) + 1; - - snap_name = kstrdup(snap_name, GFP_KERNEL); + snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); if (!snap_name) return ERR_PTR(-ENOMEM); -- cgit 
v1.2.3 From 54cac61fb6b3bacecf5367d3838307b1dd69ace2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:33 -0500 Subject: rbd: use snap_id not index to look up snap info In order to align with what was needed for format 1 rbd images, rbd_dev_v2_snap_info() was set up to take as argument an index into the array of snapshot ids in a rbd device's snapshot context. This switches that around, so we pass the snapshot id instead. In doing this, rbd_snap_name() now returns a dynamically-allocated string rather than a fixed one, so there's no need to make a duplicate in its caller, rbd_dev_spec_update(). This means the following functions take a snapshot id where they previously used an index value: rbd_dev_snap_info() rbd_dev_v1_snap_info() rbd_dev_v2_snap_info() A new function, rbd_dev_snap_index(), determines the snap index for format 1 images and uses it to look up the name. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 68 +++++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 31 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5d1ed184bed2..eb78d575d9b2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -433,6 +433,8 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); static int rbd_dev_refresh(struct rbd_device *rbd_dev); static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev); +static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, + u64 snap_id); static int rbd_open(struct block_device *bdev, fmode_t mode) { @@ -838,18 +840,27 @@ static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) return BAD_SNAP_INDEX; } -static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) +static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u64 snap_id) { - struct rbd_snap *snap; + u32 which; + which = rbd_dev_snap_index(rbd_dev, snap_id); + if (which == BAD_SNAP_INDEX) + 
return NULL; + + return _rbd_dev_v1_snap_name(rbd_dev, which); +} + +static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) +{ if (snap_id == CEPH_NOSNAP) return RBD_SNAP_HEAD_NAME; - list_for_each_entry(snap, &rbd_dev->snaps, node) - if (snap_id == snap->id) - return snap->name; + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + if (rbd_dev->image_format == 1) + return rbd_dev_v1_snap_name(rbd_dev, snap_id); - return NULL; + return rbd_dev_v2_snap_name(rbd_dev, snap_id); } static struct rbd_snap *snap_by_name(struct rbd_device *rbd_dev, @@ -3446,11 +3457,15 @@ static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev, * Returns a dynamically-allocated snapshot name if successful, or a * pointer-coded error otherwise. */ -static const char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, - u64 *snap_size, u64 *snap_features) +static const char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, + u64 snap_id, u64 *snap_size, u64 *snap_features) { const char *snap_name; + u32 which; + which = rbd_dev_snap_index(rbd_dev, snap_id); + if (which == BAD_SNAP_INDEX) + return ERR_PTR(-ENOENT); snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); if (!snap_name) return ERR_PTR(-ENOMEM); @@ -3815,12 +3830,6 @@ static int rbd_dev_spec_update(struct rbd_device *rbd_dev) /* Look up the snapshot name, and make a copy */ snap_name = rbd_snap_name(rbd_dev, spec->snap_id); - if (!snap_name) { - rbd_warn(rbd_dev, "no snapshot with id %llu", spec->snap_id); - ret = -EIO; - goto out_err; - } - snap_name = kstrdup(snap_name, GFP_KERNEL); if (!snap_name) { ret = -ENOMEM; goto out_err; @@ -3909,11 +3918,12 @@ out: return ret; } -static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) +static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, + u64 snap_id) { size_t size; void *reply_buf; - __le64 snap_id; + __le64 snapid; int ret; void *p; void *end; @@ -3924,11 +3934,10 @@ static const char 
*rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) if (!reply_buf) return ERR_PTR(-ENOMEM); - rbd_assert(which < rbd_dev->header.snapc->num_snaps); - snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); + snapid = cpu_to_le64(snap_id); ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_snapshot_name", - &snap_id, sizeof (snap_id), + &snapid, sizeof (snapid), reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) { @@ -3943,24 +3952,21 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) goto out; dout(" snap_id 0x%016llx snap_name = %s\n", - (unsigned long long)le64_to_cpu(snap_id), snap_name); + (unsigned long long)snap_id, snap_name); out: kfree(reply_buf); return snap_name; } -static const char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, - u64 *snap_size, u64 *snap_features) +static const char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, + u64 snap_id, u64 *snap_size, u64 *snap_features) { - u64 snap_id; u64 size; u64 features; const char *snap_name; int ret; - rbd_assert(which < rbd_dev->header.snapc->num_snaps); - snap_id = rbd_dev->header.snapc->snaps[which]; ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); if (ret) goto out_err; @@ -3969,7 +3975,7 @@ static const char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, if (ret) goto out_err; - snap_name = rbd_dev_v2_snap_name(rbd_dev, which); + snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); if (!IS_ERR(snap_name)) { *snap_size = size; *snap_features = features; @@ -3980,14 +3986,14 @@ out_err: return ERR_PTR(ret); } -static const char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, - u64 *snap_size, u64 *snap_features) +static const char *rbd_dev_snap_info(struct rbd_device *rbd_dev, + u64 snap_id, u64 *snap_size, u64 *snap_features) { if (rbd_dev->image_format == 1) - return rbd_dev_v1_snap_info(rbd_dev, which, + return rbd_dev_v1_snap_info(rbd_dev, 
snap_id, snap_size, snap_features); if (rbd_dev->image_format == 2) - return rbd_dev_v2_snap_info(rbd_dev, which, + return rbd_dev_v2_snap_info(rbd_dev, snap_id, snap_size, snap_features); return ERR_PTR(-EINVAL); } @@ -4085,7 +4091,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) continue; } - snap_name = rbd_dev_snap_info(rbd_dev, index, + snap_name = rbd_dev_snap_info(rbd_dev, snap_id, &snap_size, &snap_features); if (IS_ERR(snap_name)) { ret = PTR_ERR(snap_name); -- cgit v1.2.3 From 2ad3d7167e599fb149ed370a3128140b9deabd5a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:33 -0500 Subject: rbd: define rbd_snap_size() and rbd_snap_features() This patch defines a handful of new functions that will allow us to get rid of the rbd device structure's list of snapshots. Define rbd_snap_id_by_name() to look up a snapshot id given its name. This is efficient for format 1 images but not for format 2. Fortunately it only gets called at mapping time so it's not that critical. Use rbd_snap_id_by_name() to find out the id for a snapshot getting mapped, and pass that id to new functions rbd_snap_size() and rbd_snap_features() to look up information about a given snapshot's size and feature mask given its snapshot id. All this gets done in rbd_dev_mapping_set(). As a result, snap_by_name() is no longer needed, so get rid of it. 
Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 152 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 129 insertions(+), 23 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index eb78d575d9b2..bf836dea113a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -435,6 +435,11 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev); static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev); static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u64 snap_id); +static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, + u8 *order, u64 *snap_size); +static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, + u64 *snap_features); +static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name); static int rbd_open(struct block_device *bdev, fmode_t mode) { @@ -840,7 +845,8 @@ static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) return BAD_SNAP_INDEX; } -static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u64 snap_id) +static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, + u64 snap_id) { u32 which; @@ -863,35 +869,85 @@ static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) return rbd_dev_v2_snap_name(rbd_dev, snap_id); } -static struct rbd_snap *snap_by_name(struct rbd_device *rbd_dev, - const char *snap_name) +static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, + u64 *snap_size) { - struct rbd_snap *snap; + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + if (snap_id == CEPH_NOSNAP) { + *snap_size = rbd_dev->header.image_size; + } else if (rbd_dev->image_format == 1) { + u32 which; - list_for_each_entry(snap, &rbd_dev->snaps, node) - if (!strcmp(snap_name, snap->name)) - return snap; + which = rbd_dev_snap_index(rbd_dev, snap_id); + if (which == BAD_SNAP_INDEX) + return -ENOENT; - return NULL; + *snap_size = 
rbd_dev->header.snap_sizes[which]; + } else { + u64 size = 0; + int ret; + + ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); + if (ret) + return ret; + + *snap_size = size; + } + return 0; } -static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) +static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, + u64 *snap_features) { - if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME, - sizeof (RBD_SNAP_HEAD_NAME))) { - rbd_dev->mapping.size = rbd_dev->header.image_size; - rbd_dev->mapping.features = rbd_dev->header.features; + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + if (snap_id == CEPH_NOSNAP) { + *snap_features = rbd_dev->header.features; + } else if (rbd_dev->image_format == 1) { + *snap_features = 0; /* No features for format 1 */ } else { - struct rbd_snap *snap; + u64 features = 0; + int ret; + + ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); + if (ret) + return ret; + + *snap_features = features; + } + return 0; +} - snap = snap_by_name(rbd_dev, rbd_dev->spec->snap_name); - if (!snap) +static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) +{ + const char *snap_name = rbd_dev->spec->snap_name; + u64 snap_id; + u64 size = 0; + u64 features = 0; + int ret; + + if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) { + snap_id = rbd_snap_id_by_name(rbd_dev, snap_name); + if (snap_id == CEPH_NOSNAP) return -ENOENT; - rbd_dev->mapping.size = snap->size; - rbd_dev->mapping.features = snap->features; - rbd_dev->mapping.read_only = true; + } else { + snap_id = CEPH_NOSNAP; } + ret = rbd_snap_size(rbd_dev, snap_id, &size); + if (ret) + return ret; + ret = rbd_snap_features(rbd_dev, snap_id, &features); + if (ret) + return ret; + + rbd_dev->mapping.size = size; + rbd_dev->mapping.features = features; + + /* If we are mapping a snapshot it must be marked read-only */ + + if (snap_id != CEPH_NOSNAP) + rbd_dev->mapping.read_only = true; + return 0; } @@ -3766,6 +3822,56 @@ out: return image_name; } +static u64 
rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) +{ + struct ceph_snap_context *snapc = rbd_dev->header.snapc; + const char *snap_name; + u32 which = 0; + + /* Skip over names until we find the one we are looking for */ + + snap_name = rbd_dev->header.snap_names; + while (which < snapc->num_snaps) { + if (!strcmp(name, snap_name)) + return snapc->snaps[which]; + snap_name += strlen(snap_name) + 1; + which++; + } + return CEPH_NOSNAP; +} + +static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) +{ + struct ceph_snap_context *snapc = rbd_dev->header.snapc; + u32 which; + bool found = false; + u64 snap_id; + + for (which = 0; !found && which < snapc->num_snaps; which++) { + const char *snap_name; + + snap_id = snapc->snaps[which]; + snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); + if (IS_ERR(snap_name)) + break; + found = !strcmp(name, snap_name); + kfree(snap_name); + } + return found ? snap_id : CEPH_NOSNAP; +} + +/* + * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if + * no snapshot by that name is found, or if an error occurs. + */ +static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) +{ + if (rbd_dev->image_format == 1) + return rbd_v1_snap_id_by_name(rbd_dev, name); + + return rbd_v2_snap_id_by_name(rbd_dev, name); +} + /* * When an rbd image has a parent image, it is identified by the * pool, image, and snapshot ids (not names). 
This function fills @@ -3797,12 +3903,12 @@ static int rbd_dev_spec_update(struct rbd_device *rbd_dev) */ if (spec->pool_name) { if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { - struct rbd_snap *snap; + u64 snap_id; - snap = snap_by_name(rbd_dev, spec->snap_name); - if (!snap) + snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); + if (snap_id == CEPH_NOSNAP) return -ENOENT; - spec->snap_id = snap->id; + spec->snap_id = snap_id; } else { spec->snap_id = CEPH_NOSNAP; } -- cgit v1.2.3 From 33dca39f5c0c750d37d3d89ce8ae66be08280a45 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:33 -0500 Subject: rbd: kill off the snapshot list We no longer use the snapshot list for anything. When we need to look up a snapshot name, id, size, or feature mask, we just do it directly rather than relying on this list being updated with every refresh. The main reason it existed was for the benefit of the device/sysfs entries that previously were associated with snapshots. So get rid of the snapshot list, and struct rbd_snap, and the hundreds of lines of code that supported them. 
This resolves: http://tracker.ceph.com/issues/4868 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 257 +--------------------------------------------------- 1 file changed, 1 insertion(+), 256 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bf836dea113a..0ca959f5c934 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -274,14 +274,6 @@ struct rbd_img_request { #define for_each_obj_request_safe(ireq, oreq, n) \ list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) -struct rbd_snap { - const char *name; - u64 size; - struct list_head node; - u64 id; - u64 features; -}; - struct rbd_mapping { u64 size; u64 features; @@ -326,9 +318,6 @@ struct rbd_device { struct list_head node; - /* list of snapshots */ - struct list_head snaps; - /* sysfs related */ struct device dev; unsigned long open_count; /* protected by lock */ @@ -356,10 +345,7 @@ static DEFINE_SPINLOCK(rbd_client_list_lock); static int rbd_img_request_submit(struct rbd_img_request *img_request); -static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); - static void rbd_dev_device_release(struct device *dev); -static void rbd_snap_destroy(struct rbd_snap *snap); static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); @@ -3075,17 +3061,6 @@ static int rbd_read_header(struct rbd_device *rbd_dev, return ret; } -static void rbd_remove_all_snaps(struct rbd_device *rbd_dev) -{ - struct rbd_snap *snap; - struct rbd_snap *next; - - list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) { - list_del(&snap->node); - rbd_snap_destroy(snap); - } -} - static void rbd_update_mapping_size(struct rbd_device *rbd_dev) { if (rbd_dev->spec->snap_id != CEPH_NOSNAP) @@ -3134,8 +3109,6 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev) rbd_warn(rbd_dev, "object prefix changed (ignoring)"); kfree(h.object_prefix); - ret = rbd_dev_snaps_update(rbd_dev); - up_write(&rbd_dev->header_rwsem); 
return ret; @@ -3461,7 +3434,6 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, spin_lock_init(&rbd_dev->lock); rbd_dev->flags = 0; INIT_LIST_HEAD(&rbd_dev->node); - INIT_LIST_HEAD(&rbd_dev->snaps); init_rwsem(&rbd_dev->header_rwsem); rbd_dev->spec = spec; @@ -3484,54 +3456,6 @@ static void rbd_dev_destroy(struct rbd_device *rbd_dev) kfree(rbd_dev); } -static void rbd_snap_destroy(struct rbd_snap *snap) -{ - kfree(snap->name); - kfree(snap); -} - -static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev, - const char *snap_name, - u64 snap_id, u64 snap_size, - u64 snap_features) -{ - struct rbd_snap *snap; - - snap = kzalloc(sizeof (*snap), GFP_KERNEL); - if (!snap) - return ERR_PTR(-ENOMEM); - - snap->name = snap_name; - snap->id = snap_id; - snap->size = snap_size; - snap->features = snap_features; - - return snap; -} - -/* - * Returns a dynamically-allocated snapshot name if successful, or a - * pointer-coded error otherwise. - */ -static const char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, - u64 snap_id, u64 *snap_size, u64 *snap_features) -{ - const char *snap_name; - u32 which; - - which = rbd_dev_snap_index(rbd_dev, snap_id); - if (which == BAD_SNAP_INDEX) - return ERR_PTR(-ENOENT); - snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); - if (!snap_name) - return ERR_PTR(-ENOMEM); - - *snap_size = rbd_dev->header.snap_sizes[which]; - *snap_features = 0; /* No features for v1 */ - - return snap_name; -} - /* * Get the size and object order for an image snapshot, or if * snap_id is CEPH_NOSNAP, gets this information for the base @@ -3883,10 +3807,6 @@ static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) * When an image being mapped (not a parent) is probed, we have the * pool name and pool id, image name and image id, and the snapshot * name. The only thing we're missing is the snapshot id. 
- * - * The set of snapshots for an image is not known until they have - * been read by rbd_dev_snaps_update(), so we can't completely fill - * in this information until after that has been called. */ static int rbd_dev_spec_update(struct rbd_device *rbd_dev) { @@ -4065,45 +3985,6 @@ out: return snap_name; } -static const char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, - u64 snap_id, u64 *snap_size, u64 *snap_features) -{ - u64 size; - u64 features; - const char *snap_name; - int ret; - - ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); - if (ret) - goto out_err; - - ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); - if (ret) - goto out_err; - - snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); - if (!IS_ERR(snap_name)) { - *snap_size = size; - *snap_features = features; - } - - return snap_name; -out_err: - return ERR_PTR(ret); -} - -static const char *rbd_dev_snap_info(struct rbd_device *rbd_dev, - u64 snap_id, u64 *snap_size, u64 *snap_features) -{ - if (rbd_dev->image_format == 1) - return rbd_dev_v1_snap_info(rbd_dev, snap_id, - snap_size, snap_features); - if (rbd_dev->image_format == 2) - return rbd_dev_v2_snap_info(rbd_dev, snap_id, - snap_size, snap_features); - return ERR_PTR(-EINVAL); -} - static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev) { int ret; @@ -4119,141 +4000,12 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev) dout("rbd_dev_v2_snap_context returned %d\n", ret); if (ret) goto out; - ret = rbd_dev_snaps_update(rbd_dev); - dout("rbd_dev_snaps_update returned %d\n", ret); - if (ret) - goto out; out: up_write(&rbd_dev->header_rwsem); return ret; } -/* - * Scan the rbd device's current snapshot list and compare it to the - * newly-received snapshot context. Remove any existing snapshots - * not present in the new snapshot context. Add a new snapshot for - * any snaphots in the snapshot context not in the current list. - * And verify there are no changes to snapshots we already know - * about. 
- * - * Assumes the snapshots in the snapshot context are sorted by - * snapshot id, highest id first. (Snapshots in the rbd_dev's list - * are also maintained in that order.) - * - * Note that any error occurs while updating the snapshot list - * aborts the update, and the entire list is cleared. The snapshot - * list becomes inconsistent at that point anyway, so it might as - * well be empty. - */ -static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) -{ - struct ceph_snap_context *snapc = rbd_dev->header.snapc; - const u32 snap_count = snapc->num_snaps; - struct list_head *head = &rbd_dev->snaps; - struct list_head *links = head->next; - u32 index = 0; - int ret = 0; - - dout("%s: snap count is %u\n", __func__, (unsigned int)snap_count); - while (index < snap_count || links != head) { - u64 snap_id; - struct rbd_snap *snap; - const char *snap_name; - u64 snap_size = 0; - u64 snap_features = 0; - - snap_id = index < snap_count ? snapc->snaps[index] - : CEPH_NOSNAP; - snap = links != head ? list_entry(links, struct rbd_snap, node) - : NULL; - rbd_assert(!snap || snap->id != CEPH_NOSNAP); - - if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { - struct list_head *next = links->next; - - /* - * A previously-existing snapshot is not in - * the new snap context. - * - * If the now-missing snapshot is the one - * the image represents, clear its existence - * flag so we can avoid sending any more - * requests to it. - */ - if (rbd_dev->spec->snap_id == snap->id) - clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); - dout("removing %ssnap id %llu\n", - rbd_dev->spec->snap_id == snap->id ? 
- "mapped " : "", - (unsigned long long)snap->id); - - list_del(&snap->node); - rbd_snap_destroy(snap); - - /* Done with this list entry; advance */ - - links = next; - continue; - } - - snap_name = rbd_dev_snap_info(rbd_dev, snap_id, - &snap_size, &snap_features); - if (IS_ERR(snap_name)) { - ret = PTR_ERR(snap_name); - dout("failed to get snap info, error %d\n", ret); - goto out_err; - } - - dout("entry %u: snap_id = %llu\n", (unsigned int)snap_count, - (unsigned long long)snap_id); - if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { - struct rbd_snap *new_snap; - - /* We haven't seen this snapshot before */ - - new_snap = rbd_snap_create(rbd_dev, snap_name, - snap_id, snap_size, snap_features); - if (IS_ERR(new_snap)) { - ret = PTR_ERR(new_snap); - dout(" failed to add dev, error %d\n", ret); - goto out_err; - } - - /* New goes before existing, or at end of list */ - - dout(" added dev%s\n", snap ? "" : " at end\n"); - if (snap) - list_add_tail(&new_snap->node, &snap->node); - else - list_add_tail(&new_snap->node, head); - } else { - /* Already have this one */ - - dout(" already present\n"); - - rbd_assert(snap->size == snap_size); - rbd_assert(!strcmp(snap->name, snap_name)); - rbd_assert(snap->features == snap_features); - - /* Done with this list entry; advance */ - - links = links->next; - } - - /* Advance to the next entry in the snapshot context */ - - index++; - } - dout("%s: done\n", __func__); - - return 0; -out_err: - rbd_remove_all_snaps(rbd_dev); - - return ret; -} - static int rbd_bus_add_dev(struct rbd_device *rbd_dev) { struct device *dev; @@ -4913,7 +4665,6 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev) { int ret; - rbd_remove_all_snaps(rbd_dev); rbd_dev_unprobe(rbd_dev); ret = rbd_dev_header_watch_sync(rbd_dev, 0); if (ret) @@ -4963,20 +4714,14 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev) if (ret) goto err_out_watch; - ret = rbd_dev_snaps_update(rbd_dev); - if (ret) - goto err_out_probe; - ret = 
rbd_dev_spec_update(rbd_dev); if (ret) - goto err_out_snaps; + goto err_out_probe; ret = rbd_dev_probe_parent(rbd_dev); if (!ret) return 0; -err_out_snaps: - rbd_remove_all_snaps(rbd_dev); err_out_probe: rbd_dev_unprobe(rbd_dev); err_out_watch: -- cgit v1.2.3 From 15228ede7d9437b0dcfe9331c9830b3646fdadf7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 1 May 2013 12:43:03 -0500 Subject: rbd: clear EXISTS flag if mapped snapshot disappears This functionality inadvertently disappeared in the last patch. Image snapshots can get removed at just about any time. In particular, a snapshot can disappear even if it is in use by an rbd client as a mapped image. The rbd client deals with such a disappearance by responding to new requests with ENXIO. This is implemented by each rbd device maintaining an EXISTS flag, which is normally set but cleared if a snapshot disappears. This patch (re-)implements the clearing of that flag. Whenever mapped image header information is refreshed, if the mapping is for a snapshot, verify the mapped snapshot is still present in the updated snapshot context. If it is not, clear the flag. It is not necessary to check this in the initial probe, because the probe will not succeed if the snapshot doesn't exist. This resolves: http://tracker.ceph.com/issues/4880 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0ca959f5c934..3f58aba6461f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3114,6 +3114,25 @@ static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev) return ret; } +/* + * Clear the rbd device's EXISTS flag if the snapshot it's mapped to + * has disappeared from the (just updated) snapshot context.
+ */ +static void rbd_exists_validate(struct rbd_device *rbd_dev) +{ + u64 snap_id; + + if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) + return; + + snap_id = rbd_dev->spec->snap_id; + if (snap_id == CEPH_NOSNAP) + return; + + if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) + clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); +} + static int rbd_dev_refresh(struct rbd_device *rbd_dev) { u64 image_size; @@ -3126,6 +3145,10 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) ret = rbd_dev_v1_refresh(rbd_dev); else ret = rbd_dev_v2_refresh(rbd_dev); + + /* If it's a mapped snapshot, validate its EXISTS flag */ + + rbd_exists_validate(rbd_dev); mutex_unlock(&ctl_mutex); if (ret) rbd_warn(rbd_dev, "got notification but failed to " -- cgit v1.2.3 From 30d1cff817808fca9801c743d2de4c61f3f38e15 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 1 May 2013 12:43:03 -0500 Subject: rbd: use binary search for snapshot lookup Use bsearch(3) to make snapshot lookup by id more efficient. (There could be thousands of snapshots, and conceivably many more.) Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3f58aba6461f..82d9586a4172 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -819,16 +820,39 @@ static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) return kstrdup(snap_name, GFP_KERNEL); } +/* + * Snapshot id comparison function for use with qsort()/bsearch(). + * Note that result is for snapshots in *descending* order. + */ +static int snapid_compare_reverse(const void *s1, const void *s2) +{ + u64 snap_id1 = *(u64 *)s1; + u64 snap_id2 = *(u64 *)s2; + + if (snap_id1 < snap_id2) + return 1; + return snap_id1 == snap_id2 ? 
0 : -1; +} + +/* + * Search a snapshot context to see if the given snapshot id is + * present. + * + * Returns the position of the snapshot id in the array if it's found, + * or BAD_SNAP_INDEX otherwise. + * + * Note: The snapshot array is in kept sorted (by the osd) in + * reverse order, highest snapshot id first. + */ static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) { struct ceph_snap_context *snapc = rbd_dev->header.snapc; - u32 which; + u64 *found; - for (which = 0; which < snapc->num_snaps; which++) - if (snapc->snaps[which] == snap_id) - return which; + found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, + sizeof (snap_id), snapid_compare_reverse); - return BAD_SNAP_INDEX; + return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; } static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, -- cgit v1.2.3 From 1c2a9dfe2107e81b9f0ee90845c687cf7ff84106 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 1 May 2013 12:43:03 -0500 Subject: rbd: allocate image requests with a slab allocator Create a slab cache to manage rbd_img_request allocation. 
Nothing too fancy at this point--we'll still initialize everything at allocation time (no constructor) This is part of: http://tracker.ceph.com/issues/3926 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 82d9586a4172..e90abde47de0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "rbd_types.h" @@ -344,6 +345,8 @@ static DEFINE_SPINLOCK(rbd_dev_list_lock); static LIST_HEAD(rbd_client_list); /* clients */ static DEFINE_SPINLOCK(rbd_client_list_lock); +static struct kmem_cache *rbd_img_request_cache; + static int rbd_img_request_submit(struct rbd_img_request *img_request); static void rbd_dev_device_release(struct device *dev); @@ -1821,7 +1824,7 @@ static struct rbd_img_request *rbd_img_request_create( { struct rbd_img_request *img_request; - img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); + img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC); if (!img_request) return NULL; @@ -1884,7 +1887,7 @@ static void rbd_img_request_destroy(struct kref *kref) if (img_request_child_test(img_request)) rbd_obj_request_put(img_request->obj_request); - kfree(img_request); + kmem_cache_free(rbd_img_request_cache, img_request); } static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) @@ -4992,6 +4995,26 @@ static void rbd_sysfs_cleanup(void) device_unregister(&rbd_root_dev); } +static int rbd_slab_init(void) +{ + rbd_assert(!rbd_img_request_cache); + rbd_img_request_cache = kmem_cache_create("rbd_img_request", + sizeof (struct rbd_img_request), + __alignof__(struct rbd_img_request), + 0, NULL); + if (rbd_img_request_cache) + return 0; + + return -ENOMEM; +} + +static void rbd_slab_exit(void) +{ + rbd_assert(rbd_img_request_cache); + 
kmem_cache_destroy(rbd_img_request_cache); + rbd_img_request_cache = NULL; +} + static int __init rbd_init(void) { int rc; @@ -5001,16 +5024,22 @@ static int __init rbd_init(void) return -EINVAL; } - rc = rbd_sysfs_init(); + rc = rbd_slab_init(); if (rc) return rc; - pr_info("loaded " RBD_DRV_NAME_LONG "\n"); - return 0; + rc = rbd_sysfs_init(); + if (rc) + rbd_slab_exit(); + else + pr_info("loaded " RBD_DRV_NAME_LONG "\n"); + + return rc; } static void __exit rbd_exit(void) { rbd_sysfs_cleanup(); + rbd_slab_exit(); } module_init(rbd_init); -- cgit v1.2.3 From f907ad55967fec6bc6ec5ee84021070c49cf0bb1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 1 May 2013 12:43:03 -0500 Subject: rbd: allocate name separate from obj_request The next patch will define a slab allocator for object requests. To use that we'll need to allocate the name of an object separate from the request structure itself. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e90abde47de0..d74be04ceeff 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1758,11 +1758,16 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, rbd_assert(obj_request_type_valid(type)); size = strlen(object_name) + 1; - obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); - if (!obj_request) + name = kmalloc(size, GFP_KERNEL); + if (!name) + return NULL; + + obj_request = kzalloc(sizeof (*obj_request), GFP_KERNEL); + if (!obj_request) { + kfree(name); return NULL; + } - name = (char *)(obj_request + 1); obj_request->object_name = memcpy(name, object_name, size); obj_request->offset = offset; obj_request->length = length; @@ -1808,6 +1813,7 @@ static void rbd_obj_request_destroy(struct kref *kref) break; } + kfree(obj_request->object_name); kfree(obj_request); } -- cgit v1.2.3 From
868311b1ebc9b203bae0d6d1f012ea5cbdadca03 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 1 May 2013 12:43:03 -0500 Subject: rbd: allocate object requests with a slab allocator Create a slab cache to manage rbd_obj_request allocation. We aren't using a constructor, and we'll zero-fill object request structures when they're allocated. This is part of: http://tracker.ceph.com/issues/3926 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d74be04ceeff..a72842aa3b53 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -346,6 +346,7 @@ static LIST_HEAD(rbd_client_list); /* clients */ static DEFINE_SPINLOCK(rbd_client_list_lock); static struct kmem_cache *rbd_img_request_cache; +static struct kmem_cache *rbd_obj_request_cache; static int rbd_img_request_submit(struct rbd_img_request *img_request); @@ -1762,7 +1763,7 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, if (!name) return NULL; - obj_request = kzalloc(sizeof (*obj_request), GFP_KERNEL); + obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL); if (!obj_request) { kfree(name); return NULL; @@ -1814,7 +1815,8 @@ static void rbd_obj_request_destroy(struct kref *kref) } kfree(obj_request->object_name); - kfree(obj_request); + obj_request->object_name = NULL; + kmem_cache_free(rbd_obj_request_cache, obj_request); } /* @@ -5008,14 +5010,29 @@ static int rbd_slab_init(void) sizeof (struct rbd_img_request), __alignof__(struct rbd_img_request), 0, NULL); - if (rbd_img_request_cache) + if (!rbd_img_request_cache) + return -ENOMEM; + + rbd_assert(!rbd_obj_request_cache); + rbd_obj_request_cache = kmem_cache_create("rbd_obj_request", + sizeof (struct rbd_obj_request), + __alignof__(struct rbd_obj_request), + 0, NULL); + if (rbd_obj_request_cache) return 0; + 
kmem_cache_destroy(rbd_img_request_cache); + rbd_img_request_cache = NULL; + return -ENOMEM; } static void rbd_slab_exit(void) { + rbd_assert(rbd_obj_request_cache); + kmem_cache_destroy(rbd_obj_request_cache); + rbd_obj_request_cache = NULL; + rbd_assert(rbd_img_request_cache); kmem_cache_destroy(rbd_img_request_cache); rbd_img_request_cache = NULL; -- cgit v1.2.3 From 78c2a44aae2950ecf0279590572b861288714946 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 1 May 2013 12:43:04 -0500 Subject: rbd: allocate image object names with a slab allocator The names of objects used for image object requests are always fixed size. So create a slab cache to manage them. Define a new function rbd_segment_name_free() to match rbd_segment_name() (which is what supplies the dynamically-allocated name buffer). This is part of: http://tracker.ceph.com/issues/3926 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a72842aa3b53..390946a078be 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -345,8 +345,11 @@ static DEFINE_SPINLOCK(rbd_dev_list_lock); static LIST_HEAD(rbd_client_list); /* clients */ static DEFINE_SPINLOCK(rbd_client_list_lock); +/* Slab caches for frequently-allocated structures */ + static struct kmem_cache *rbd_img_request_cache; static struct kmem_cache *rbd_obj_request_cache; +static struct kmem_cache *rbd_segment_name_cache; static int rbd_img_request_submit(struct rbd_img_request *img_request); @@ -985,7 +988,7 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) u64 segment; int ret; - name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO); + name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); if (!name) return NULL; segment = offset >> rbd_dev->header.obj_order; @@ -1001,6 +1004,13 @@ static const char *rbd_segment_name(struct 
rbd_device *rbd_dev, u64 offset) return name; } +static void rbd_segment_name_free(const char *name) +{ + /* The explicit cast here is needed to drop the const qualifier */ + + kmem_cache_free(rbd_segment_name_cache, (void *)name); +} + static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) { u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; @@ -2033,7 +2043,8 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, length = rbd_segment_length(rbd_dev, img_offset, resid); obj_request = rbd_obj_request_create(object_name, offset, length, type); - kfree(object_name); /* object request has its own copy */ + /* object request has its own copy of the object name */ + rbd_segment_name_free(object_name); if (!obj_request) goto out_unwind; @@ -5018,8 +5029,19 @@ static int rbd_slab_init(void) sizeof (struct rbd_obj_request), __alignof__(struct rbd_obj_request), 0, NULL); - if (rbd_obj_request_cache) + if (!rbd_obj_request_cache) + goto out_err; + + rbd_assert(!rbd_segment_name_cache); + rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", + MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL); + if (rbd_segment_name_cache) return 0; +out_err: + if (rbd_obj_request_cache) { + kmem_cache_destroy(rbd_obj_request_cache); + rbd_obj_request_cache = NULL; + } kmem_cache_destroy(rbd_img_request_cache); rbd_img_request_cache = NULL; @@ -5029,6 +5051,10 @@ static int rbd_slab_init(void) static void rbd_slab_exit(void) { + rbd_assert(rbd_segment_name_cache); + kmem_cache_destroy(rbd_segment_name_cache); + rbd_segment_name_cache = NULL; + rbd_assert(rbd_obj_request_cache); kmem_cache_destroy(rbd_obj_request_cache); rbd_obj_request_cache = NULL; -- cgit v1.2.3 From b5b09be30cf99f9c699e825629f02e3bce555d44 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 1 May 2013 21:37:07 -0500 Subject: rbd: fix image request leak on parent read When a read for a layered image object finds the target object doesn't exist, a read image request for the parent image is 
created and submitted. When that completes, the callback routine was not releasing that parent image request. Fix that. The slab allocation stuff just added has greatly simplified the search for the source of this memory leak. This resolves: http://tracker.ceph.com/issues/4803 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 390946a078be..c2ca1818f335 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2547,6 +2547,7 @@ static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) obj_request->xferred = img_request->xferred; } out: + rbd_img_request_put(img_request); rbd_img_obj_request_read_callback(obj_request); rbd_obj_request_complete(obj_request); } -- cgit v1.2.3