summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/cgroup.h16
-rw-r--r--kernel/cgroup.c223
2 files changed, 131 insertions, 108 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 528e2aed36c3..3a1cb265afd6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -346,6 +346,22 @@ struct css_set {
*/
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+ /*
+ * List of csets participating in the on-going migration either as
+ * source or destination. Protected by cgroup_mutex.
+ */
+ struct list_head mg_node;
+
+ /*
+ * If this cset is acting as the source of migration the following
+ * two fields are set. mg_src_cgrp is the source cgroup of the
+ * on-going migration and mg_dst_cset is the destination cset the
+ * target tasks on this cset should be migrated to. Protected by
+ * cgroup_mutex.
+ */
+ struct cgroup *mg_src_cgrp;
+ struct css_set *mg_dst_cset;
+
/* For RCU-protected deletion */
struct rcu_head rcu_head;
};
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b80c611ff836..5def4a800425 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,6 @@
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
#include <linux/delay.h>
@@ -645,6 +644,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->mg_node);
INIT_HLIST_NODE(&cset->hlist);
/* Copy the set of subsystem state objects generated in
@@ -1639,20 +1639,26 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
-/*
- * Control Group taskset
- */
-struct task_and_cgroup {
- struct task_struct *task;
- struct cgroup *cgrp;
- struct css_set *cset;
-};
-
+/* used to track tasks and other necessary states during migration */
struct cgroup_taskset {
- struct task_and_cgroup single;
- struct flex_array *tc_array;
- int tc_array_len;
- int idx;
+ /* the src and dst cset list running through cset->mg_node */
+ struct list_head src_csets;
+ struct list_head dst_csets;
+
+ /*
+ * Fields for cgroup_taskset_*() iteration.
+ *
+ * Before migration is committed, the target migration tasks are on
+ * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
+ * the csets on ->dst_csets. ->csets point to either ->src_csets
+ * or ->dst_csets depending on whether migration is committed.
+ *
+ * ->cur_csets and ->cur_task point to the current task position
+ * during iteration.
+ */
+ struct list_head *csets;
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
};
/**
@@ -1663,12 +1669,10 @@ struct cgroup_taskset {
*/
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
{
- if (tset->tc_array) {
- tset->idx = 0;
- return cgroup_taskset_next(tset);
- } else {
- return tset->single.task;
- }
+ tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
+ tset->cur_task = NULL;
+
+ return cgroup_taskset_next(tset);
}
/**
@@ -1680,13 +1684,27 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
*/
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
{
- struct task_and_cgroup *tc;
+ struct css_set *cset = tset->cur_cset;
+ struct task_struct *task = tset->cur_task;
- if (!tset->tc_array || tset->idx >= tset->tc_array_len)
- return NULL;
+ while (&cset->mg_node != tset->csets) {
+ if (!task)
+ task = list_first_entry(&cset->mg_tasks,
+ struct task_struct, cg_list);
+ else
+ task = list_next_entry(task, cg_list);
- tc = flex_array_get(tset->tc_array, tset->idx++);
- return tc->task;
+ if (&task->cg_list != &cset->mg_tasks) {
+ tset->cur_cset = cset;
+ tset->cur_task = task;
+ return task;
+ }
+
+ cset = list_next_entry(cset, mg_node);
+ task = NULL;
+ }
+
+ return NULL;
}
/**
@@ -1714,11 +1732,13 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
WARN_ON_ONCE(tsk->flags & PF_EXITING);
old_cset = task_css_set(tsk);
+ get_css_set(new_cset);
+
task_lock(tsk);
rcu_assign_pointer(tsk->cgroups, new_cset);
task_unlock(tsk);
- list_move(&tsk->cg_list, &new_cset->tasks);
+ list_move(&tsk->cg_list, &new_cset->mg_tasks);
/*
* We just gained a reference on old_cset by taking it from the
@@ -1741,80 +1761,58 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader,
bool threadgroup)
{
- int ret, i, group_size;
- struct cgroupfs_root *root = cgrp->root;
+ struct cgroup_taskset tset = {
+ .src_csets = LIST_HEAD_INIT(tset.src_csets),
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
+ .csets = &tset.src_csets,
+ };
struct cgroup_subsys_state *css, *failed_css = NULL;
- /* threadgroup list cursor and array */
- struct task_struct *task;
- struct task_and_cgroup *tc;
- struct flex_array *group;
- struct cgroup_taskset tset = { };
-
- /*
- * step 0: in order to do expensive, possibly blocking operations for
- * every thread, we cannot iterate the thread group list, since it needs
- * rcu or tasklist locked. instead, build an array of all threads in the
- * group - group_rwsem prevents new threads from appearing, and if
- * threads exit, this will just be an over-estimate.
- */
- if (threadgroup)
- group_size = get_nr_threads(leader);
- else
- group_size = 1;
- /* flex_array supports very large thread-groups better than kmalloc. */
- group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
- if (!group)
- return -ENOMEM;
- /* pre-allocate to guarantee space while iterating in rcu read-side. */
- ret = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
- if (ret)
- goto out_free_group_list;
+ struct css_set *cset, *tmp_cset;
+ struct task_struct *task, *tmp_task;
+ int i, ret;
- i = 0;
/*
* Prevent freeing of tasks while we take a snapshot. Tasks that are
* already PF_EXITING could be freed from underneath us unless we
* take an rcu_read_lock.
*/
- down_read(&css_set_rwsem);
+ down_write(&css_set_rwsem);
rcu_read_lock();
task = leader;
do {
- struct task_and_cgroup ent;
+ struct cgroup *src_cgrp;
/* @task either already exited or can't exit until the end */
if (task->flags & PF_EXITING)
goto next;
- /* as per above, nr_threads may decrease, but not increase. */
- BUG_ON(i >= group_size);
- ent.task = task;
- ent.cgrp = task_cgroup_from_root(task, root);
+ cset = task_css_set(task);
+ src_cgrp = task_cgroup_from_root(task, cgrp->root);
+
/* nothing to do if this task is already in the cgroup */
- if (ent.cgrp == cgrp)
+ if (src_cgrp == cgrp)
goto next;
- /*
- * saying GFP_ATOMIC has no effect here because we did prealloc
- * earlier, but it's good form to communicate our expectations.
- */
- ret = flex_array_put(group, i, &ent, GFP_ATOMIC);
- BUG_ON(ret != 0);
- i++;
+
+ if (!cset->mg_src_cgrp) {
+ WARN_ON(!list_empty(&cset->mg_tasks));
+ WARN_ON(!list_empty(&cset->mg_node));
+
+ cset->mg_src_cgrp = src_cgrp;
+ list_add(&cset->mg_node, &tset.src_csets);
+ get_css_set(cset);
+ }
+
+ list_move(&task->cg_list, &cset->mg_tasks);
next:
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
- up_read(&css_set_rwsem);
- /* remember the number of threads in the array for later. */
- group_size = i;
- tset.tc_array = group;
- tset.tc_array_len = group_size;
+ up_write(&css_set_rwsem);
/* methods shouldn't be called if no task is actually migrating */
- ret = 0;
- if (!group_size)
- goto out_free_group_list;
+ if (list_empty(&tset.src_csets))
+ return 0;
/*
* step 1: check that we can legitimately attach to the cgroup.
@@ -1833,16 +1831,21 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader,
* step 2: make sure css_sets exist for all threads to be migrated.
* we use find_css_set, which allocates a new one if necessary.
*/
- for (i = 0; i < group_size; i++) {
- struct css_set *old_cset;
+ list_for_each_entry(cset, &tset.src_csets, mg_node) {
+ struct css_set *dst_cset;
- tc = flex_array_get(group, i);
- old_cset = task_css_set(tc->task);
- tc->cset = find_css_set(old_cset, cgrp);
- if (!tc->cset) {
+ dst_cset = find_css_set(cset, cgrp);
+ if (!dst_cset) {
ret = -ENOMEM;
- goto out_put_css_set_refs;
+ goto out_release_tset;
}
+
+ if (list_empty(&dst_cset->mg_node))
+ list_add(&dst_cset->mg_node, &tset.dst_csets);
+ else
+ put_css_set(dst_cset, false);
+
+ cset->mg_dst_cset = dst_cset;
}
/*
@@ -1851,12 +1854,17 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader,
* failure cases after here, so this is the commit point.
*/
down_write(&css_set_rwsem);
- for (i = 0; i < group_size; i++) {
- tc = flex_array_get(group, i);
- cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
+ list_for_each_entry(cset, &tset.src_csets, mg_node) {
+ list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
+ cgroup_task_migrate(cset->mg_src_cgrp, task,
+ cset->mg_dst_cset);
}
up_write(&css_set_rwsem);
- /* nothing is sensitive to fork() after this point. */
+
+ /* migration is committed, all target tasks are now on dst_csets */
+ tset.csets = &tset.dst_csets;
+
+ /* nothing is sensitive to fork() after this point */
/*
* step 4: do subsystem attach callbacks.
@@ -1865,30 +1873,27 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader,
if (css->ss->attach)
css->ss->attach(css, &tset);
- /*
- * step 5: success! and cleanup
- */
ret = 0;
-out_put_css_set_refs:
- if (ret) {
- for (i = 0; i < group_size; i++) {
- tc = flex_array_get(group, i);
- if (!tc->cset)
- break;
- put_css_set(tc->cset, false);
- }
- }
+ goto out_release_tset;
+
out_cancel_attach:
- if (ret) {
- for_each_css(css, i, cgrp) {
- if (css == failed_css)
- break;
- if (css->ss->cancel_attach)
- css->ss->cancel_attach(css, &tset);
- }
+ for_each_css(css, i, cgrp) {
+ if (css == failed_css)
+ break;
+ if (css->ss->cancel_attach)
+ css->ss->cancel_attach(css, &tset);
}
-out_free_group_list:
- flex_array_free(group);
+out_release_tset:
+ down_write(&css_set_rwsem);
+ list_splice_init(&tset.dst_csets, &tset.src_csets);
+ list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
+ list_splice_init(&cset->mg_tasks, &cset->tasks);
+ cset->mg_dst_cset = NULL;
+ cset->mg_src_cgrp = NULL;
+ list_del_init(&cset->mg_node);
+ put_css_set_locked(cset, false);
+ }
+ up_write(&css_set_rwsem);
return ret;
}
@@ -3895,6 +3900,8 @@ int __init cgroup_init_early(void)
atomic_set(&init_css_set.refcount, 1);
INIT_LIST_HEAD(&init_css_set.cgrp_links);
INIT_LIST_HEAD(&init_css_set.tasks);
+ INIT_LIST_HEAD(&init_css_set.mg_tasks);
+ INIT_LIST_HEAD(&init_css_set.mg_node);
INIT_HLIST_NODE(&init_css_set.hlist);
css_set_count = 1;
init_cgroup_root(&cgroup_dummy_root);