Diffstat (limited to 'drivers/staging/ramster/tmem.c')
-rw-r--r-- | drivers/staging/ramster/tmem.c | 313
1 file changed, 178 insertions, 135 deletions
diff --git a/drivers/staging/ramster/tmem.c b/drivers/staging/ramster/tmem.c
index 8f2f6892d8d3..a2b7e03b6062 100644
--- a/drivers/staging/ramster/tmem.c
+++ b/drivers/staging/ramster/tmem.c
@@ -1,33 +1,43 @@
 /*
  * In-kernel transcendent memory (generic implementation)
  *
- * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
+ * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp.
  *
  * The primary purpose of Transcedent Memory ("tmem") is to map object-oriented
  * "handles" (triples containing a pool id, and object id, and an index), to
  * pages in a page-accessible memory (PAM). Tmem references the PAM pages via
  * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
  * set of functions (pamops). Each pampd contains some representation of
- * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of
- * pages and must be able to insert, find, and delete these pages at a
- * potential frequency of thousands per second concurrently across many CPUs,
- * (and, if used with KVM, across many vcpus across many guests).
- * Tmem is tracked with a hierarchy of data structures, organized by
- * the elements in a handle-tuple: pool_id, object_id, and page index.
- * One or more "clients" (e.g. guests) each provide one or more tmem_pools.
- * Each pool, contains a hash table of rb_trees of tmem_objs. Each
- * tmem_obj contains a radix-tree-like tree of pointers, with intermediate
- * nodes called tmem_objnodes. Each leaf pointer in this tree points to
- * a pampd, which is accessible only through a small set of callbacks
- * registered by the PAM implementation (see tmem_register_pamops). Tmem
- * does all memory allocation via a set of callbacks registered by the tmem
- * host implementation (e.g. see tmem_register_hostops).
+ * PAGE_SIZE bytes worth of data. For those familiar with key-value stores,
+ * the tmem handle is a three-level hierarchical key, and the value is always
+ * reconstituted (but not necessarily stored) as PAGE_SIZE bytes and is
+ * referenced in the datastore by the pampd. The hierarchy is required
+ * to ensure that certain invalidation functions can be performed efficiently
+ * (i.e. flush all indexes associated with this object_id, or
+ * flush all objects associated with this pool).
+ *
+ * Tmem must support potentially millions of pages and must be able to insert,
+ * find, and delete these pages at a potential frequency of thousands per
+ * second concurrently across many CPUs, (and, if used with KVM, across many
+ * vcpus across many guests). Tmem is tracked with a hierarchy of data
+ * structures, organized by the elements in the handle-tuple: pool_id,
+ * object_id, and page index. One or more "clients" (e.g. guests) each
+ * provide one or more tmem_pools. Each pool contains a hash table of
+ * rb_trees of tmem_objs. Each tmem_obj contains a radix-tree-like tree
+ * of pointers, with intermediate nodes called tmem_objnodes. Each leaf
+ * pointer in this tree points to a pampd, which is accessible only through
+ * a small set of callbacks registered by the PAM implementation (see
+ * tmem_register_pamops). Tmem only needs to do memory allocation for objs
+ * and objnodes, and this is done via a set of callbacks that must be
+ * registered by the tmem host implementation (e.g. see tmem_register_hostops).
  */

 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/atomic.h>
+#ifdef CONFIG_RAMSTER
 #include <linux/delay.h>
+#endif

 #include "tmem.h"
@@ -52,7 +62,7 @@ void tmem_register_hostops(struct tmem_hostops *m)

 /*
  * A tmem host implementation must use this function to register
- * callbacks for a page-accessible memory (PAM) implementation
+ * callbacks for a page-accessible memory (PAM) implementation.
  */
 static struct tmem_pamops tmem_pamops;

@@ -67,42 +77,62 @@ void tmem_register_pamops(struct tmem_pamops *m)
  * So an rb_tree is an ideal data structure to manage tmem_objs. But because
  * of the potentially huge number of tmem_objs, each pool manages a hashtable
  * of rb_trees to reduce search, insert, delete, and rebalancing time.
- * Each hashbucket also has a lock to manage concurrent access.
+ * Each hashbucket also has a lock to manage concurrent access and no
+ * searches, inserts, or deletions can be performed unless the lock is held.
+ * As a result, care must be taken to ensure tmem routines are not called
+ * recursively; the vast majority of the time, a recursive call may work
+ * but a deadlock will occur a small fraction of the time due to the
+ * hashbucket lock.
  *
- * The following routines manage tmem_objs. When any tmem_obj is accessed,
- * the hashbucket lock must be held.
+ * The following routines manage tmem_objs. In all of these routines,
+ * the hashbucket lock is already held.
  */

-/* searches for object==oid in pool, returns locked object if found */
-static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
-                                        struct tmem_oid *oidp)
+/* Search for object==oid in pool, returns object if found. */
+static struct tmem_obj *__tmem_obj_find(struct tmem_hashbucket *hb,
+                                        struct tmem_oid *oidp,
+                                        struct rb_node **parent,
+                                        struct rb_node ***link)
 {
-        struct rb_node *rbnode;
-        struct tmem_obj *obj;
+        struct rb_node *_parent = NULL, **rbnode;
+        struct tmem_obj *obj = NULL;

-        rbnode = hb->obj_rb_root.rb_node;
-        while (rbnode) {
-                BUG_ON(RB_EMPTY_NODE(rbnode));
-                obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
+        rbnode = &hb->obj_rb_root.rb_node;
+        while (*rbnode) {
+                BUG_ON(RB_EMPTY_NODE(*rbnode));
+                _parent = *rbnode;
+                obj = rb_entry(*rbnode, struct tmem_obj,
+                               rb_tree_node);
                 switch (tmem_oid_compare(oidp, &obj->oid)) {
                 case 0: /* equal */
                         goto out;
                 case -1:
-                        rbnode = rbnode->rb_left;
+                        rbnode = &(*rbnode)->rb_left;
                         break;
                 case 1:
-                        rbnode = rbnode->rb_right;
+                        rbnode = &(*rbnode)->rb_right;
                         break;
                 }
         }
+
+        if (parent)
+                *parent = _parent;
+        if (link)
+                *link = rbnode;
         obj = NULL;
 out:
         return obj;
 }

-static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);
+static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
+                                        struct tmem_oid *oidp)
+{
+        return __tmem_obj_find(hb, oidp, NULL, NULL);
+}

-/* free an object that has no more pampds in it */
+static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *, bool);
+
+/* Free an object that has no more pampds in it. */
 static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
 {
         struct tmem_pool *pool;
@@ -113,7 +143,7 @@ static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
         pool = obj->pool;
         BUG_ON(pool == NULL);
         if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
-                tmem_pampd_destroy_all_in_obj(obj);
+                tmem_pampd_destroy_all_in_obj(obj, false);
         BUG_ON(obj->objnode_tree_root != NULL);
         BUG_ON((long)obj->objnode_count != 0);
         atomic_dec(&pool->obj_count);
@@ -125,15 +155,14 @@ static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
 }

 /*
- * initialize, and insert an tmem_object_root (called only if find failed)
+ * Initialize and insert a tmem_object_root (called only if find failed).
  */
 static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
                                         struct tmem_pool *pool,
                                         struct tmem_oid *oidp)
 {
         struct rb_root *root = &hb->obj_rb_root;
-        struct rb_node **new = &(root->rb_node), *parent = NULL;
-        struct tmem_obj *this;
+        struct rb_node **new = NULL, *parent = NULL;

         BUG_ON(pool == NULL);
         atomic_inc(&pool->obj_count);
@@ -143,24 +172,15 @@ static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
         obj->oid = *oidp;
         obj->objnode_count = 0;
         obj->pampd_count = 0;
-        (*tmem_pamops.new_obj)(obj);
+#ifdef CONFIG_RAMSTER
+        if (tmem_pamops.new_obj != NULL)
+                (*tmem_pamops.new_obj)(obj);
+#endif
         SET_SENTINEL(obj, OBJ);
-        while (*new) {
-                BUG_ON(RB_EMPTY_NODE(*new));
-                this = rb_entry(*new, struct tmem_obj, rb_tree_node);
-                parent = *new;
-                switch (tmem_oid_compare(oidp, &this->oid)) {
-                case 0:
-                        BUG(); /* already present; should never happen! */
-                        break;
-                case -1:
-                        new = &(*new)->rb_left;
-                        break;
-                case 1:
-                        new = &(*new)->rb_right;
-                        break;
-                }
-        }
+
+        if (__tmem_obj_find(hb, oidp, &parent, &new))
+                BUG();
+
         rb_link_node(&obj->rb_tree_node, parent, new);
         rb_insert_color(&obj->rb_tree_node, root);
 }
@@ -170,7 +190,7 @@ static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
  * "ephemeral" vs "persistent". These attributes apply to all tmem_objs
  * and all pampds that belong to a tmem_pool. A tmem_pool is created
  * or deleted relatively rarely (for example, when a filesystem is
- * mounted or unmounted.
+ * mounted or unmounted).
  */

 /* flush all data from a pool and, optionally, free it */
@@ -188,7 +208,7 @@ static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
         while (rbnode != NULL) {
                 obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
                 rbnode = rb_next(rbnode);
-                tmem_pampd_destroy_all_in_obj(obj);
+                tmem_pampd_destroy_all_in_obj(obj, true);
                 tmem_obj_free(obj, hb);
                 (*tmem_hostops.obj_free)(obj, pool);
         }
@@ -274,7 +294,7 @@ static void tmem_objnode_free(struct tmem_objnode *objnode)
 }

 /*
- * lookup index in object and return associated pampd (or NULL if not found)
+ * Lookup index in object and return associated pampd (or NULL if not found).
  */
 static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
 {
@@ -316,6 +336,7 @@ static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
         return slot != NULL ? *slot : NULL;
 }

+#ifdef CONFIG_RAMSTER
 static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
                                         void *new_pampd, bool no_free)
 {
@@ -333,6 +354,7 @@ static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
         }
         return ret;
 }
+#endif

 static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
                                         void *pampd)
@@ -470,7 +492,7 @@ out:
         return slot;
 }

-/* recursively walk the objnode_tree destroying pampds and objnodes */
+/* Recursively walk the objnode_tree destroying pampds and objnodes. */
 static void tmem_objnode_node_destroy(struct tmem_obj *obj,
                                         struct tmem_objnode *objnode,
                                         unsigned int ht)
@@ -495,7 +517,8 @@ static void tmem_objnode_node_destroy(struct tmem_obj *obj,
         }
 }

-static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
+static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj,
+                                        bool pool_destroy)
 {
         if (obj->objnode_tree_root == NULL)
                 return;
@@ -510,7 +533,10 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
                 obj->objnode_tree_height = 0;
         }
         obj->objnode_tree_root = NULL;
-        (*tmem_pamops.free_obj)(obj->pool, obj);
+#ifdef CONFIG_RAMSTER
+        if (tmem_pamops.free_obj != NULL)
+                (*tmem_pamops.free_obj)(obj->pool, obj, pool_destroy);
+#endif
 }

 /*
@@ -523,17 +549,16 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
  */

 /*
- * "Put" a page, e.g. copy a page from the kernel into newly allocated
- * PAM space (if such space is available). Tmem_put is complicated by
- * a corner case: What if a page with matching handle already exists in
- * tmem? To guarantee coherency, one of two actions is necessary: Either
- * the data for the page must be overwritten, or the page must be
- * "flushed" so that the data is not accessible to a subsequent "get".
- * Since these "duplicate puts" are relatively rare, this implementation
- * always flushes for simplicity.
+ * "Put" a page, e.g. associate the passed pampd with the passed handle.
+ * Tmem_put is complicated by a corner case: What if a page with matching
+ * handle already exists in tmem? To guarantee coherency, one of two
+ * actions is necessary: Either the data for the page must be overwritten,
+ * or the page must be "flushed" so that the data is not accessible to a
+ * subsequent "get". Since these "duplicate puts" are relatively rare,
+ * this implementation always flushes for simplicity.
  */
 int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
-                char *data, size_t size, bool raw, int ephemeral)
+                bool raw, void *pampd_to_use)
 {
         struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
         void *pampd = NULL, *pampd_del = NULL;
@@ -566,19 +591,17 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
         }
         BUG_ON(obj == NULL);
         BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
-        pampd = (*tmem_pamops.create)(data, size, raw, ephemeral,
-                                        obj->pool, &obj->oid, index);
-        if (unlikely(pampd == NULL))
-                goto free;
+        pampd = pampd_to_use;
+        BUG_ON(pampd_to_use == NULL);
         ret = tmem_pampd_add_to_obj(obj, index, pampd);
         if (unlikely(ret == -ENOMEM))
                 /* may have partially built objnode tree ("stump") */
                 goto delete_and_free;
+        (*tmem_pamops.create_finish)(pampd, is_ephemeral(pool));
         goto out;

delete_and_free:
         (void)tmem_pampd_delete_from_obj(obj, index);
-free:
         if (pampd)
                 (*tmem_pamops.free)(pampd, pool, NULL, 0, true);
         if (objnew) {
@@ -590,6 +613,16 @@ out:
         return ret;
 }

+#ifdef CONFIG_RAMSTER
+/*
+ * For ramster only: The following routines provide a two-step sequence
+ * to allow the caller to replace a pampd in the tmem data structures with
+ * another pampd. Here, we look up the passed handle and, if found, return the
+ * associated pampd and object, leaving the hashbucket locked and returning
+ * a reference to it. The caller is expected to immediately call the
+ * matching tmem_localify_finish routine, which handles the replacement
+ * and unlocks the hashbucket.
+ */
 void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
                                 uint32_t index, struct tmem_obj **ret_obj,
                                 void **saved_hb)
@@ -618,6 +651,7 @@ void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
         if (pampd != NULL) {
                 BUG_ON(obj == NULL);
                 (void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
+                (*tmem_pamops.create_finish)(pampd, is_ephemeral(obj->pool));
         } else if (delete) {
                 BUG_ON(obj == NULL);
                 (void)tmem_pampd_delete_from_obj(obj, index);
@@ -625,6 +659,9 @@ void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
         spin_unlock(&hb->lock);
 }

+/*
+ * For ramster only. Helper function to support asynchronous tmem_get.
+ */
 static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
                                 struct tmem_pool *pool, struct tmem_oid *oidp,
                                 uint32_t index, bool free, char *data)
 {
@@ -633,7 +670,6 @@ static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
         bool intransit = false;
         int ret = 0;

-        if (!is_ephemeral(pool))
                 new_pampd = (*tmem_pamops.repatriate_preload)(
                                 old_pampd, pool, oidp, index, &intransit);
@@ -646,60 +682,91 @@ static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
         if (!intransit)
                 ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
                                                 oidp, index, free, data);
+        if (ret == -EAGAIN) {
+                /* rare I think, but should cond_resched()??? */
+                usleep_range(10, 1000);
+        } else if (ret == -ENOTCONN || ret == -EHOSTDOWN) {
+                ret = -1;
+        } else if (ret != 0 && ret != -ENOENT) {
+                ret = -1;
+        }
+        /* note hb->lock has now been unlocked */
+        return ret;
+}
+
+/*
+ * For ramster only. If a page in tmem matches the handle, replace the
+ * page so that any subsequent "get" gets the new page. Returns 0 if
+ * there was a page to replace, else returns -1.
+ */
+int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
+                        uint32_t index, void *new_pampd)
+{
+        struct tmem_obj *obj;
+        int ret = -1;
+        struct tmem_hashbucket *hb;
+
+        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
+        spin_lock(&hb->lock);
+        obj = tmem_obj_find(hb, oidp);
+        if (obj == NULL)
+                goto out;
+        new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
+        /* if we bug here, pamops wasn't properly set up for ramster */
+        BUG_ON(tmem_pamops.replace_in_obj == NULL);
+        ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
+out:
+        spin_unlock(&hb->lock);
         return ret;
 }
+#endif

 /*
- * "Get" a page, e.g. if one can be found, copy the tmem page with the
- * matching handle from PAM space to the kernel. By tmem definition,
- * when a "get" is successful on an ephemeral page, the page is "flushed",
- * and when a "get" is successful on a persistent page, the page is retained
- * in tmem. Note that to preserve
+ * "Get" a page, e.g. if a pampd can be found matching the passed handle,
+ * use a pamops callback to recreate the page from the pampd with the
+ * matching handle. By tmem definition, when a "get" is successful on
+ * an ephemeral page, the page is "flushed", and when a "get" is successful
+ * on a persistent page, the page is retained in tmem. Note that to preserve
  * coherency, "get" can never be skipped if tmem contains the data.
  * That is, if a get is done with a certain handle and fails, any
  * subsequent "get" must also fail (unless of course there is a
  * "put" done with the same handle).
-
  */
 int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
-                char *data, size_t *size, bool raw, int get_and_free)
+                char *data, size_t *sizep, bool raw, int get_and_free)
 {
         struct tmem_obj *obj;
-        void *pampd;
+        void *pampd = NULL;
         bool ephemeral = is_ephemeral(pool);
         int ret = -1;
         struct tmem_hashbucket *hb;
         bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
-        bool lock_held = 0;
+        bool lock_held = false;
         void **ppampd;

-again:
-        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
-        spin_lock(&hb->lock);
-        lock_held = 1;
-        obj = tmem_obj_find(hb, oidp);
-        if (obj == NULL)
-                goto out;
-        ppampd = __tmem_pampd_lookup_in_obj(obj, index);
-        if (ppampd == NULL)
-                goto out;
-        if (tmem_pamops.is_remote(*ppampd)) {
-                ret = tmem_repatriate(ppampd, hb, pool, oidp,
-                                        index, free, data);
-                lock_held = 0; /* note hb->lock has been unlocked */
-                if (ret == -EAGAIN) {
-                        /* rare I think, but should cond_resched()??? */
-                        usleep_range(10, 1000);
-                        goto again;
-                } else if (ret != 0) {
-                        if (ret != -ENOENT)
-                                pr_err("UNTESTED case in tmem_get, ret=%d\n",
-                                        ret);
-                        ret = -1;
+        do {
+                hb = &pool->hashbucket[tmem_oid_hash(oidp)];
+                spin_lock(&hb->lock);
+                lock_held = true;
+                obj = tmem_obj_find(hb, oidp);
+                if (obj == NULL)
+                        goto out;
+                ppampd = __tmem_pampd_lookup_in_obj(obj, index);
+                if (ppampd == NULL)
                         goto out;
+#ifdef CONFIG_RAMSTER
+                if ((tmem_pamops.is_remote != NULL) &&
+                        tmem_pamops.is_remote(*ppampd)) {
+                        ret = tmem_repatriate(ppampd, hb, pool, oidp,
+                                                index, free, data);
+                        /* tmem_repatriate releases hb->lock */
+                        lock_held = false;
+                        *sizep = PAGE_SIZE;
+                        if (ret != -EAGAIN)
+                                goto out;
                 }
-                goto out;
-        }
+#endif
+        } while (ret == -EAGAIN);
         if (free)
                 pampd = tmem_pampd_delete_from_obj(obj, index);
         else
@@ -715,10 +782,10 @@ again:
         }
         if (free)
                 ret = (*tmem_pamops.get_data_and_free)(
-                        data, size, raw, pampd, pool, oidp, index);
+                        data, sizep, raw, pampd, pool, oidp, index);
         else
                 ret = (*tmem_pamops.get_data)(
-                        data, size, raw, pampd, pool, oidp, index);
+                        data, sizep, raw, pampd, pool, oidp, index);
         if (ret < 0)
                 goto out;
         ret = 0;
@@ -762,30 +829,6 @@ out:
 }

 /*
- * If a page in tmem matches the handle, replace the page so that any
- * subsequent "get" gets the new page. Returns the new page if
- * there was a page to replace, else returns NULL.
- */
-int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
-                        uint32_t index, void *new_pampd)
-{
-        struct tmem_obj *obj;
-        int ret = -1;
-        struct tmem_hashbucket *hb;
-
-        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
-        spin_lock(&hb->lock);
-        obj = tmem_obj_find(hb, oidp);
-        if (obj == NULL)
-                goto out;
-        new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
-        ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
-out:
-        spin_unlock(&hb->lock);
-        return ret;
-}
-
-/*
 * "Flush" all pages in tmem matching this oid.
 */
 int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
@@ -799,7 +842,7 @@ int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
         obj = tmem_obj_find(hb, oidp);
         if (obj == NULL)
                 goto out;
-        tmem_pampd_destroy_all_in_obj(obj);
+        tmem_pampd_destroy_all_in_obj(obj, false);
         tmem_obj_free(obj, hb);
         (*tmem_hostops.obj_free)(obj, pool);
         ret = 0;
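
The new header comment in this commit describes the handle as a three-level hierarchical key: the pool selects a hash bucket, the object id selects a tmem_obj within that bucket's rb_tree, and the page index selects a leaf pampd in the object's radix-tree-like objnode tree. What follows is a minimal userspace sketch of that lookup path, not the driver code: a linked list and a fixed array stand in for the rb_tree and the objnode tree, and every identifier (toy_pool, toy_obj_find, and so on) is hypothetical.

/*
 * Toy model of the tmem lookup hierarchy: a handle (pool, object id,
 * index) resolves to a per-pool hash bucket, then to an object found in
 * that bucket, then to a per-index slot holding the "pampd".  The real
 * code also holds a per-bucket lock across every search/insert/delete.
 */
#include <stdint.h>
#include <stdlib.h>

#define TOY_HASH_BUCKETS 256
#define TOY_MAX_INDEX    64

struct toy_obj {
        uint64_t oid;                   /* object id (one level of the key) */
        void *pampd[TOY_MAX_INDEX];     /* leaf pointers, one per page index */
        struct toy_obj *next;           /* bucket chain */
};

struct toy_bucket {
        struct toy_obj *objs;           /* stands in for the rb_tree root */
};

struct toy_pool {
        struct toy_bucket bucket[TOY_HASH_BUCKETS];
};

static struct toy_obj *toy_obj_find(struct toy_pool *pool, uint64_t oid)
{
        struct toy_bucket *hb = &pool->bucket[oid % TOY_HASH_BUCKETS];
        struct toy_obj *obj;

        for (obj = hb->objs; obj != NULL; obj = obj->next)
                if (obj->oid == oid)
                        return obj;
        return NULL;
}

/* Resolve a full handle to its pampd, or NULL if any level misses. */
static void *toy_lookup(struct toy_pool *pool, uint64_t oid, uint32_t index)
{
        struct toy_obj *obj = toy_obj_find(pool, oid);

        if (obj == NULL || index >= TOY_MAX_INDEX)
                return NULL;
        return obj->pampd[index];
}

int main(void)
{
        struct toy_pool *p = calloc(1, sizeof(*p));

        if (p == NULL)
                return 1;
        /* nothing has been put yet, so every lookup misses */
        return toy_lookup(p, 42, 0) == NULL ? 0 : 1;
}

The hierarchy also makes the flush operations cheap, as the comment notes: flushing an object drops one subtree of indexes, and flushing a pool walks only that pool's buckets.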
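The main mechanical change in the diff is that the duplicated rb_tree walks in the old tmem_obj_find() and tmem_obj_init() are folded into one helper, __tmem_obj_find(), which can also report the parent node and link slot that rb_link_node() needs for insertion. The sketch below shows the same find-with-link pattern on a plain, unbalanced binary search tree rather than the kernel rb_tree API (so only the link is recorded, not the parent), and all names are illustrative.

/*
 * One descent records the child slot (link) where the search ended, so a
 * plain lookup and an insert can share the same walk.
 */
#include <stdint.h>
#include <stdlib.h>

struct node {
        uint64_t key;
        struct node *left, *right;
};

static struct node *find_with_link(struct node **root, uint64_t key,
                                   struct node ***linkp)
{
        struct node **link = root;

        while (*link != NULL) {
                if (key == (*link)->key)
                        return *link;           /* found; caller ignores *linkp */
                else if (key < (*link)->key)
                        link = &(*link)->left;
                else
                        link = &(*link)->right;
        }
        if (linkp)
                *linkp = link;                  /* where a new node would hang */
        return NULL;
}

/* Insert only if absent, reusing the same walk (mirrors tmem_obj_init()). */
static struct node *insert(struct node **root, uint64_t key)
{
        struct node **link = NULL;
        struct node *n = find_with_link(root, key, &link);

        if (n != NULL)
                return n;               /* duplicate; the driver BUG()s here */
        n = calloc(1, sizeof(*n));
        if (n == NULL)
                return NULL;
        n->key = key;
        *link = n;                      /* the equivalent of rb_link_node() */
        return n;
}

int main(void)
{
        struct node *root = NULL;

        insert(&root, 3);
        insert(&root, 7);
        return find_with_link(&root, 7, NULL) ? 0 : 1;
}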
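The comment ahead of tmem_put() keeps the coherency rule for duplicate puts: when data already exists under the same handle it must be either overwritten or flushed, and this implementation always flushes. Continuing the toy model from the first sketch (toy_pool, toy_obj_find and TOY_MAX_INDEX are the same hypothetical helpers), a put under that rule might look like the function below; unlike the real tmem_put(), the sketch does not allocate a new object when the oid is absent.

static int toy_put(struct toy_pool *pool, uint64_t oid, uint32_t index,
                   void *new_pampd)
{
        struct toy_obj *obj = toy_obj_find(pool, oid);

        if (obj == NULL || index >= TOY_MAX_INDEX)
                return -1;              /* the driver would create the obj */
        if (obj->pampd[index] != NULL) {
                /* duplicate put: flush the old data first for coherency */
                free(obj->pampd[index]);
                obj->pampd[index] = NULL;
        }
        obj->pampd[index] = new_pampd;
        return 0;
}

Flushing rather than overwriting keeps the invariant the get-side comment relies on: once a get for a handle fails, later gets for that handle must also fail until a new put is done.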