summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIlya Dryomov <ilya.dryomov@inktank.com>2014-03-19 16:58:37 +0200
committerSage Weil <sage@inktank.com>2014-04-04 21:07:26 -0700
commite2b149cc4ba00766aceb87950c6de72ea7fc8b2e (patch)
tree5f3d7b5dd55b7f75c412db786e1e6f4915ef9ed8
parent6ed1002f368c63ef79d7f659fcb4368a90098132 (diff)
crush: add chooseleaf_vary_r tunable
The current crush_choose_firstn code will re-use the same 'r' value for the recursive call. That means that if we are hitting a collision or rejection for some reason (say, an OSD that is marked out) and need to retry, we will keep making the same (bad) choice in that recursive selection. Introduce a tunable that fixes that behavior by incorporating the parent 'r' value into the recursive starting point, so that a different path will be taken in subsequent placement attempts. Note that this was done from the get-go for the new crush_choose_indep algorithm. This was exposed by a user who was seeing PGs stuck in active+remapped after reweight-by-utilization because the up set mapped to a single OSD. Reflects ceph.git commit a8e6c9fbf88bad056dd05d3eb790e98a5e43451a. Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com> Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
-rw-r--r--include/linux/crush/crush.h6
-rw-r--r--net/ceph/crush/mapper.c30
2 files changed, 30 insertions, 6 deletions
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index acaa5615d634..75f36a6c7f67 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -173,6 +173,12 @@ struct crush_map {
* apply to a collision: in that case we will retry as we used
* to. */
__u32 chooseleaf_descend_once;
+
+ /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
+ * bits. a value of 1 is best for new clusters. for legacy clusters
+ * that want to limit reshuffling, a value of 3 or 4 will make the
+ * mappings line up a bit better with previous mappings. */
+ __u8 chooseleaf_vary_r;
};
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index b3fb84903b30..947150cde297 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -295,7 +295,9 @@ static int is_out(const struct crush_map *map,
* @local_retries: localized retries
* @local_fallback_retries: localized fallback retries
* @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
+ * @vary_r: pass r to recursive calls
* @out2: second output vector for leaf items (if @recurse_to_leaf)
+ * @parent_r: r value passed from the parent
*/
static int crush_choose_firstn(const struct crush_map *map,
struct crush_bucket *bucket,
@@ -307,7 +309,9 @@ static int crush_choose_firstn(const struct crush_map *map,
unsigned int local_retries,
unsigned int local_fallback_retries,
int recurse_to_leaf,
- int *out2)
+ unsigned int vary_r,
+ int *out2,
+ int parent_r)
{
int rep;
unsigned int ftotal, flocal;
@@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map,
int itemtype;
int collide, reject;
- dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
- bucket->id, x, outpos, numrep);
+ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
+ recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep,
+ tries, recurse_tries, local_retries, local_fallback_retries,
+ parent_r);
for (rep = outpos; rep < numrep; rep++) {
/* keep trying until we get a non-out, non-colliding item */
@@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map,
do {
collide = 0;
retry_bucket = 0;
- r = rep;
+ r = rep + parent_r;
/* r' = r + f_total */
r += ftotal;
@@ -387,6 +394,11 @@ static int crush_choose_firstn(const struct crush_map *map,
reject = 0;
if (!collide && recurse_to_leaf) {
if (item < 0) {
+ int sub_r;
+ if (vary_r)
+ sub_r = r >> (vary_r-1);
+ else
+ sub_r = 0;
if (crush_choose_firstn(map,
map->buckets[-1-item],
weight, weight_max,
@@ -396,7 +408,9 @@ static int crush_choose_firstn(const struct crush_map *map,
local_retries,
local_fallback_retries,
0,
- NULL) <= outpos)
+ vary_r,
+ NULL,
+ sub_r) <= outpos)
/* didn't get leaf */
reject = 1;
} else {
@@ -653,6 +667,8 @@ int crush_do_rule(const struct crush_map *map,
int choose_local_retries = map->choose_local_tries;
int choose_local_fallback_retries = map->choose_local_fallback_tries;
+ int vary_r = map->chooseleaf_vary_r;
+
if ((__u32)ruleno >= map->max_rules) {
dprintk(" bad ruleno %d\n", ruleno);
return 0;
@@ -745,7 +761,9 @@ int crush_do_rule(const struct crush_map *map,
choose_local_retries,
choose_local_fallback_retries,
recurse_to_leaf,
- c+osize);
+ vary_r,
+ c+osize,
+ 0);
} else {
crush_choose_indep(
map,