-rw-r--r--	include/linux/sched.h	  4
-rw-r--r--	kernel/sched.c		270
-rw-r--r--	kernel/sched_fair.c	 84
-rw-r--r--	kernel/sysctl.c		 18
4 files changed, 331 insertions, 45 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d6eacda765ca..288245f83bd4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1453,6 +1453,10 @@ extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+extern unsigned int sysctl_sched_min_bal_int_shares;
+extern unsigned int sysctl_sched_max_bal_int_shares;
+#endif
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,
diff --git a/kernel/sched.c b/kernel/sched.c
index d9585f15043f..86e55a9c2de6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,7 +168,43 @@ struct task_group {
 	struct sched_entity **se;
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
+
+	/*
+	 * The shares assigned to a task group govern how much of the cpu
+	 * bandwidth is allocated to the group. The more shares a group has,
+	 * the more cpu bandwidth is allocated to it.
+	 *
+	 * For example, let's say that there are three task groups, A, B and
+	 * C, which have been assigned shares 1000, 2000 and 3000 respectively.
+	 * Then the cpu bandwidth allocated by the scheduler to task groups
+	 * A, B and C should be:
+	 *
+	 *	Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
+	 *	Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
+	 *	Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
+	 *
+	 * The weight assigned to a task group's schedulable entities on every
+	 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
+	 * group's shares. For example, let's say that task group A has been
+	 * assigned shares of 1000 and there are two CPUs in the system. Then,
+	 *
+	 *	tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
+	 *
+	 * Note: It's not necessary that each of a task group's schedulable
+	 *	 entities has the same weight on all CPUs. If the group
+	 *	 has 2 of its tasks on CPU0 and 1 task on CPU1, then a
+	 *	 better distribution of weight could be:
+	 *
+	 *	tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
+	 *	tg_A->se[1]->load.weight = 1/3 * 2000 = 667
+	 *
+	 * rebalance_shares() is responsible for distributing the shares of a
+	 * task group like this among the group's schedulable entities across
+	 * cpus.
+	 *
+	 */
 	unsigned long shares;
+
 	struct rcu_head rcu;
 };
 
@@ -188,6 +224,14 @@ static DEFINE_MUTEX(task_group_mutex);
 /* doms_cur_mutex serializes access to doms_cur[] array */
 static DEFINE_MUTEX(doms_cur_mutex);
 
+#ifdef CONFIG_SMP
+/* kernel thread that runs rebalance_shares() periodically */
+static struct task_struct *lb_monitor_task;
+static int load_balance_monitor(void *unused);
+#endif
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares);
+
 /* Default task group.
  * Every task in system belong to this group at bootup.
 */
@@ -202,6 +246,8 @@ struct task_group init_task_group = {
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
 #endif
 
+#define MIN_GROUP_SHARES	2
+
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 
 /* return group to which a task belongs */
@@ -6736,6 +6782,21 @@ void __init sched_init_smp(void)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (nr_cpu_ids == 1)
+		return;
+
+	lb_monitor_task = kthread_create(load_balance_monitor, NULL,
+					 "group_balance");
+	if (!IS_ERR(lb_monitor_task)) {
+		lb_monitor_task->flags |= PF_NOFREEZE;
+		wake_up_process(lb_monitor_task);
+	} else {
+		printk(KERN_ERR "Could not create load balance monitor thread "
+			"(error = %ld)\n", PTR_ERR(lb_monitor_task));
+	}
+#endif
 }
 #else
 void __init sched_init_smp(void)
@@ -6988,6 +7049,157 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+#ifdef CONFIG_SMP
+/*
+ * distribute shares of all task groups among their schedulable entities,
+ * to reflect load distribution across cpus.
+ */
+static int rebalance_shares(struct sched_domain *sd, int this_cpu)
+{
+	struct cfs_rq *cfs_rq;
+	struct rq *rq = cpu_rq(this_cpu);
+	cpumask_t sdspan = sd->span;
+	int balanced = 1;
+
+	/* Walk through all the task groups that we have */
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		int i;
+		unsigned long total_load = 0, total_shares;
+		struct task_group *tg = cfs_rq->tg;
+
+		/* Gather total task load of this group across cpus */
+		for_each_cpu_mask(i, sdspan)
+			total_load += tg->cfs_rq[i]->load.weight;
+
+		/* Nothing to do if this group has no load */
+		if (!total_load)
+			continue;
+
+		/*
+		 * tg->shares represents the number of cpu shares the task group
+		 * is eligible to hold on a single cpu. On N cpus, it is
+		 * eligible to hold (N * tg->shares) number of cpu shares.
+		 */
+		total_shares = tg->shares * cpus_weight(sdspan);
+
+		/*
+		 * redistribute total_shares across cpus as per the task load
+		 * distribution.
+		 */
+		for_each_cpu_mask(i, sdspan) {
+			unsigned long local_load, local_shares;
+
+			local_load = tg->cfs_rq[i]->load.weight;
+			local_shares = (local_load * total_shares) / total_load;
+			if (!local_shares)
+				local_shares = MIN_GROUP_SHARES;
+			if (local_shares == tg->se[i]->load.weight)
+				continue;
+
+			spin_lock_irq(&cpu_rq(i)->lock);
+			set_se_shares(tg->se[i], local_shares);
+			spin_unlock_irq(&cpu_rq(i)->lock);
+			balanced = 0;
+		}
+	}
+
+	return balanced;
+}
+
+/*
+ * How frequently should we rebalance_shares() across cpus?
+ *
+ * The more frequently we rebalance shares, the more accurate is the fairness
+ * of cpu bandwidth distribution between task groups. However, a higher
+ * frequency also implies increased scheduling overhead.
+ *
+ * sysctl_sched_min_bal_int_shares represents the minimum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * sysctl_sched_max_bal_int_shares represents the maximum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * These settings allow for the appropriate tradeoff between accuracy of
+ * fairness and the associated overhead.
+ *
+ */
+
+/* default: 8ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
+
+/* default: 128ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
+
+/* kernel thread that runs rebalance_shares() periodically */
+static int load_balance_monitor(void *unused)
+{
+	unsigned int timeout = sysctl_sched_min_bal_int_shares;
+	struct sched_param schedparm;
+	int ret;
+
+	/*
+	 * We don't want this thread's execution to be limited by the shares
+	 * assigned to the default group (init_task_group). Hence make it run
+	 * as a SCHED_RR RT task at the lowest priority.
+	 */
+	schedparm.sched_priority = 1;
+	ret = sched_setscheduler(current, SCHED_RR, &schedparm);
+	if (ret)
+		printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
+				" monitor thread (error = %d)\n", ret);
+
+	while (!kthread_should_stop()) {
+		int i, cpu, balanced = 1;
+
+		/* Prevent cpus from going down or coming up */
+		lock_cpu_hotplug();
+		/* lock out changes to the doms_cur[] array */
+		lock_doms_cur();
+		/*
+		 * Enter an RCU read-side critical section to safely walk the
+		 * rq->sd chain on various cpus and to walk the task group list
+		 * (rq->leaf_cfs_rq_list) in rebalance_shares().
+		 */
+		rcu_read_lock();
+
+		for (i = 0; i < ndoms_cur; i++) {
+			cpumask_t cpumap = doms_cur[i];
+			struct sched_domain *sd = NULL, *sd_prev = NULL;
+
+			cpu = first_cpu(cpumap);
+
+			/* Find the highest domain at which to balance shares */
+			for_each_domain(cpu, sd) {
+				if (!(sd->flags & SD_LOAD_BALANCE))
+					continue;
+				sd_prev = sd;
+			}
+
+			sd = sd_prev;
+			/* sd == NULL? No load balancing required in this domain */
+			if (!sd)
+				continue;
+
+			balanced &= rebalance_shares(sd, cpu);
+		}
+
+		rcu_read_unlock();
+
+		unlock_doms_cur();
+		unlock_cpu_hotplug();
+
+		if (!balanced)
+			timeout = sysctl_sched_min_bal_int_shares;
+		else if (timeout < sysctl_sched_max_bal_int_shares)
+			timeout *= 2;
+
+		msleep_interruptible(timeout);
+	}
+
+	return 0;
+}
+#endif	/* CONFIG_SMP */
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(void)
 {
@@ -7144,47 +7356,77 @@ done:
 	task_rq_unlock(rq, &flags);
 }
 
+/* rq->lock must be held by the caller */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
 	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	spin_lock_irq(&rq->lock);
+	if (!shares)
+		shares = MIN_GROUP_SHARES;
 
 	on_rq = se->on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_entity(cfs_rq, se, 0);
+		dec_cpu_load(rq, se->load.weight);
+	}
 
 	se->load.weight = shares;
 	se->load.inv_weight = div64_64((1ULL<<32), shares);
 
-	if (on_rq)
+	if (on_rq) {
 		enqueue_entity(cfs_rq, se, 0);
-
-	spin_unlock_irq(&rq->lock);
+		inc_cpu_load(rq, se->load.weight);
+	}
 }
 
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
 	int i;
-
-	/*
-	 * A weight of 0 or 1 can cause arithmetics problems.
-	 * (The default weight is 1024 - so there's no practical
-	 * limitation from this.)
-	 */
-	if (shares < 2)
-		shares = 2;
+	struct cfs_rq *cfs_rq;
+	struct rq *rq;
 
 	lock_task_group_list();
 	if (tg->shares == shares)
 		goto done;
 
+	if (shares < MIN_GROUP_SHARES)
+		shares = MIN_GROUP_SHARES;
+
+	/*
+	 * Prevent any load balance activity (rebalance_shares,
+	 * load_balance_fair) from referring to this group first,
+	 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
+	 */
+	for_each_possible_cpu(i) {
+		cfs_rq = tg->cfs_rq[i];
+		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+	}
+
+	/* wait for any ongoing reference to this group to finish */
+	synchronize_sched();
+
+	/*
+	 * Now we are free to modify the group's share on each cpu
+	 * without tripping rebalance_shares() or load_balance_fair().
+	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i)
+	for_each_possible_cpu(i) {
+		spin_lock_irq(&cpu_rq(i)->lock);
 		set_se_shares(tg->se[i], shares);
+		spin_unlock_irq(&cpu_rq(i)->lock);
+	}
 
+	/*
+	 * Enable load balance activity on this group by inserting it back on
+	 * each cpu's rq->leaf_cfs_rq_list.
+	 */
+	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+		cfs_rq = tg->cfs_rq[i];
+		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+	}
 done:
 	unlock_task_group_list();
 	return 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 30ae9c2a2861..5c208e090ae4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -707,6 +707,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
+#define GROUP_IMBALANCE_PCT	20
+
 #else	/* CONFIG_FAIR_GROUP_SCHED */
 
 #define for_each_sched_entity(se) \
@@ -967,25 +969,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
-{
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running)
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
-
-	p = task_of(curr);
-
-	return p->prio;
-}
-#endif
-
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -995,28 +978,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
 	struct rq_iterator cfs_rq_iterator;
+	unsigned long load_moved;
 
 	cfs_rq_iterator.start = load_balance_start_fair;
 	cfs_rq_iterator.next = load_balance_next_fair;
 
 	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 #ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
-		long imbalance;
-		unsigned long maxload;
+		struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
+		unsigned long maxload, task_load, group_weight;
+		unsigned long thisload, per_task_load;
+		struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		task_load = busy_cfs_rq->load.weight;
+		group_weight = se->load.weight;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		/*
+		 * 'group_weight' is contributed by tasks of total weight
+		 * 'task_load'. To move 'rem_load_move' worth of weight only,
+		 * we need to move a maximum task load of:
+		 *
+		 *	maxload = (rem_load_move / group_weight) * task_load;
+		 */
+		maxload = (rem_load_move * task_load) / group_weight;
+
+		if (!maxload || !task_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		per_task_load = task_load / busy_cfs_rq->nr_running;
+		/*
+		 * balance_tasks will try to forcibly move at least one task if
+		 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
+		 * maxload is less than GROUP_IMBALANCE_PCT% of the per_task_load.
+		 */
+		if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
+			continue;
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+		/* Disable priority-based load balance */
+		*this_best_prio = 0;
+		thisload = this_cfs_rq->load.weight;
 #else
 # define maxload rem_load_move
 #endif
@@ -1025,11 +1025,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		 * load_balance_[start|next]_fair iterators
 		 */
 		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+		load_moved = balance_tasks(this_rq, this_cpu, busiest,
				maxload, sd, idle, all_pinned,
				this_best_prio,
				&cfs_rq_iterator);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		/*
+		 * load_moved holds the task load that was moved. The
+		 * effective (group) weight moved would be:
+		 *	load_moved_eff = load_moved / task_load * group_weight;
+		 */
+		load_moved = (group_weight * load_moved) / task_load;
+
+		/* Adjust shares on both cpus to reflect load_moved */
+		group_weight -= load_moved;
+		set_se_shares(se, group_weight);
+
+		se = busy_cfs_rq->tg->se[this_cpu];
+		if (!thisload)
+			group_weight = load_moved;
+		else
+			group_weight = se->load.weight + load_moved;
+		set_se_shares(se, group_weight);
+#endif
+
+		rem_load_move -= load_moved;
+
 		if (rem_load_move <= 0)
 			break;
 	}
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c68f68dcc605..c95f3ed34474 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -309,6 +309,24 @@ static struct ctl_table kern_table[] = {
 		.mode		= 644,
 		.proc_handler	= &proc_dointvec,
 	},
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_min_bal_int_shares",
+		.data		= &sysctl_sched_min_bal_int_shares,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_max_bal_int_shares",
+		.data		= &sysctl_sched_max_bal_int_shares,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 #endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
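
The arithmetic behind rebalance_shares() and the monitor's adaptive sleep interval can be exercised outside the kernel. Below is a minimal userspace sketch, not part of the patch: the fixed per-cpu loads, the group's shares value and the loop count are made up for illustration, while MIN_GROUP_SHARES, the proportional split (local_load * total_shares / total_load) and the double-on-balanced / reset-on-imbalance interval mirror the logic added above.

/*
 * Illustrative userspace model of the share rebalancing added above.
 * Not kernel code: loads are fixed numbers, "cpus" are array slots.
 */
#include <stdio.h>

#define NR_CPUS			2
#define MIN_GROUP_SHARES	2
#define MIN_BAL_INT		8	/* ms, like sysctl_sched_min_bal_int_shares */
#define MAX_BAL_INT		128	/* ms, like sysctl_sched_max_bal_int_shares */

/* Proportional split: same formula as rebalance_shares() */
static int rebalance(unsigned long shares, const unsigned long load[NR_CPUS],
		     unsigned long se_weight[NR_CPUS])
{
	unsigned long total_load = 0, total_shares = shares * NR_CPUS;
	int i, balanced = 1;

	for (i = 0; i < NR_CPUS; i++)
		total_load += load[i];
	if (!total_load)
		return 1;

	for (i = 0; i < NR_CPUS; i++) {
		unsigned long local = (load[i] * total_shares) / total_load;

		if (!local)
			local = MIN_GROUP_SHARES;
		if (local != se_weight[i]) {
			se_weight[i] = local;
			balanced = 0;
		}
	}
	return balanced;
}

int main(void)
{
	/* Group A: shares = 1000, two tasks on CPU0, one task on CPU1 */
	unsigned long load[NR_CPUS] = { 2048, 1024 };
	unsigned long se_weight[NR_CPUS] = { 1000, 1000 };
	unsigned int timeout = MIN_BAL_INT;
	int round;

	for (round = 0; round < 5; round++) {
		int balanced = rebalance(1000, load, se_weight);

		/* Same back-off as load_balance_monitor() */
		if (!balanced)
			timeout = MIN_BAL_INT;
		else if (timeout < MAX_BAL_INT)
			timeout *= 2;

		printf("round %d: se[0]=%lu se[1]=%lu next interval=%ums\n",
		       round, se_weight[0], se_weight[1], timeout);
	}
	return 0;
}

The first round redistributes the weights roughly 2:1 (1333 and 666) and resets the interval to the minimum; once the load stops changing, the interval doubles each round until it hits the maximum, matching the timeout handling in load_balance_monitor().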
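Similarly, the two conversions in load_balance_fair() between task load and effective group weight reduce to simple proportions. A small sketch with made-up numbers (not taken from a real runqueue), assuming one task of weight 1024 ends up being moved:

#include <stdio.h>

int main(void)
{
	/* Hypothetical values for illustration only */
	unsigned long task_load = 3072;    /* busy_cfs_rq->load.weight         */
	unsigned long group_weight = 1333; /* tg->se[busiest->cpu]->load.weight */
	unsigned long rem_load_move = 500; /* weight the balancer wants moved   */

	/* maxload = (rem_load_move / group_weight) * task_load */
	unsigned long maxload = (rem_load_move * task_load) / group_weight;

	/* suppose balance_tasks() managed to move one task of weight 1024 */
	unsigned long load_moved = 1024;

	/* effective group weight moved: load_moved / task_load * group_weight */
	unsigned long load_moved_eff = (group_weight * load_moved) / task_load;

	printf("maxload=%lu, effective group weight moved=%lu\n",
	       maxload, load_moved_eff);
	return 0;
}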